[llvm] [RegAlloc] Scale the spill weight by the weight of register class (PR #113675)

Pengcheng Wang via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 5 21:55:11 PST 2024


https://github.com/wangpc-pp updated https://github.com/llvm/llvm-project/pull/113675

>From 92eac5d2cbd39a2a87d38d984a4ac4c9c082f544 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Fri, 25 Oct 2024 18:23:03 +0800
Subject: [PATCH 1/2] [RegAlloc][RISCV] Increase the spill weight by target
 factor

Currently, the spill weight is only determined by isDef/isUse and
block frequency. However, for registers with different register
classes, the costs of spilling them are different.

For example, for `LMUL>1` registers (in which, several physical regsiter
compound a bigger logical register), the costs are larger than
`LMUL=1` case (in which, there is only one physical register).

To sovle this problem, a new target hook `getSpillWeightFactor` is
added. Targets can override the default factor (which is 1) according
to the register classes.

For RISC-V, the factors are set to the `RegClassWeight` which is
used to track regsiter pressure. The values of `RegClassWeight`
are the number of register units.

I believe all the targets can benefit from this change, but I will
shrink the range of tests to RISC-V only.

Partially fixes #113489.
---
 llvm/include/llvm/CodeGen/LiveIntervals.h     |    4 +-
 .../include/llvm/CodeGen/TargetRegisterInfo.h |    4 +
 llvm/lib/CodeGen/CalcSpillWeights.cpp         |   12 +-
 llvm/lib/CodeGen/LiveIntervals.cpp            |    8 +-
 llvm/lib/CodeGen/TargetRegisterInfo.cpp       |    5 +
 llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp   |    5 +
 llvm/lib/Target/RISCV/RISCVRegisterInfo.h     |    3 +
 llvm/test/CodeGen/RISCV/rvv/abs-vp.ll         |   37 +-
 llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll  |  136 +-
 llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll       |   94 +-
 llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll        |   61 +-
 llvm/test/CodeGen/RISCV/rvv/compressstore.ll  |   38 +-
 llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll       |  207 +-
 llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll        |  311 +--
 llvm/test/CodeGen/RISCV/rvv/expandload.ll     |   97 +-
 .../RISCV/rvv/fixed-vectors-bitreverse-vp.ll  |  226 +--
 .../RISCV/rvv/fixed-vectors-bswap-vp.ll       |   96 +-
 .../RISCV/rvv/fixed-vectors-ceil-vp.ll        |   58 +-
 .../RISCV/rvv/fixed-vectors-ctlz-vp.ll        |  628 ++----
 .../RISCV/rvv/fixed-vectors-ctpop-vp.ll       |  182 +-
 .../RISCV/rvv/fixed-vectors-cttz-vp.ll        |  404 ++--
 .../RISCV/rvv/fixed-vectors-floor-vp.ll       |   58 +-
 .../RISCV/rvv/fixed-vectors-fmaximum-vp.ll    |   59 +-
 .../RISCV/rvv/fixed-vectors-fminimum-vp.ll    |   59 +-
 .../rvv/fixed-vectors-int-explodevector.ll    |   26 +-
 .../rvv/fixed-vectors-interleaved-access.ll   |  632 +++---
 .../RISCV/rvv/fixed-vectors-nearbyint-vp.ll   |   16 -
 .../RISCV/rvv/fixed-vectors-reduction-fp.ll   |  296 ++-
 .../RISCV/rvv/fixed-vectors-rint-vp.ll        |   16 -
 .../RISCV/rvv/fixed-vectors-round-vp.ll       |   58 +-
 .../RISCV/rvv/fixed-vectors-roundeven-vp.ll   |   58 +-
 .../RISCV/rvv/fixed-vectors-roundtozero-vp.ll |   58 +-
 .../RISCV/rvv/fixed-vectors-setcc-fp-vp.ll    |  624 +++---
 .../RISCV/rvv/fixed-vectors-setcc-int-vp.ll   |   58 +-
 .../RISCV/rvv/fixed-vectors-trunc-vp.ll       |  148 +-
 .../RISCV/rvv/fixed-vectors-vcopysign-vp.ll   |   28 +-
 .../RISCV/rvv/fixed-vectors-vfmax-vp.ll       |   28 +-
 .../RISCV/rvv/fixed-vectors-vfmin-vp.ll       |   28 +-
 .../RISCV/rvv/fixed-vectors-vselect-vp.ll     |   84 +-
 llvm/test/CodeGen/RISCV/rvv/floor-vp.ll       |   61 +-
 .../test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll |   38 +-
 llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll    |  243 +--
 .../test/CodeGen/RISCV/rvv/fminimum-sdnode.ll |   38 +-
 llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll    |  243 +--
 llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll   |  286 +--
 .../test/CodeGen/RISCV/rvv/mscatter-sdnode.ll |   82 +-
 llvm/test/CodeGen/RISCV/rvv/rint-vp.ll        |   45 -
 llvm/test/CodeGen/RISCV/rvv/round-vp.ll       |   61 +-
 llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll   |   61 +-
 llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll |   61 +-
 llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll    |  286 ++-
 llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll   |   60 +-
 .../test/CodeGen/RISCV/rvv/strided-vpstore.ll |   30 +-
 .../RISCV/rvv/vector-deinterleave-load.ll     |   51 +-
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  |   90 +-
 llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll       |  184 +-
 llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll       |  184 +-
 llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll        | 1754 ++++++++---------
 .../RISCV/rvv/vfmadd-constrained-sdnode.ll    |  110 +-
 llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll  |  306 +--
 .../RISCV/rvv/vfmsub-constrained-sdnode.ll    |   17 +-
 llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll       |   92 +-
 llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll    |   26 +-
 llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll       |  184 +-
 llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll     |   24 +-
 llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll    |   24 +-
 llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll    |   24 +-
 llvm/test/CodeGen/RISCV/rvv/vpstore.ll        |   30 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll     |    9 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll     |   96 +-
 llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll      |    9 +-
 71 files changed, 3637 insertions(+), 6122 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h
index 161bb247a0e968..a58ba178ac8484 100644
--- a/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -117,14 +117,14 @@ class LiveIntervals {
   /// If \p PSI is provided the calculation is altered for optsize functions.
   static float getSpillWeight(bool isDef, bool isUse,
                               const MachineBlockFrequencyInfo *MBFI,
-                              const MachineInstr &MI,
+                              const MachineInstr &MI, unsigned Factor = 1,
                               ProfileSummaryInfo *PSI = nullptr);
 
   /// Calculate the spill weight to assign to a single instruction.
   /// If \p PSI is provided the calculation is altered for optsize functions.
   static float getSpillWeight(bool isDef, bool isUse,
                               const MachineBlockFrequencyInfo *MBFI,
-                              const MachineBasicBlock *MBB,
+                              const MachineBasicBlock *MBB, unsigned Factor = 1,
                               ProfileSummaryInfo *PSI = nullptr);
 
   LiveInterval &getInterval(Register Reg) {
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 292fa3c94969be..7dd272fed996d1 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -926,6 +926,10 @@ class TargetRegisterInfo : public MCRegisterInfo {
   /// Returns a -1 terminated array of pressure set IDs.
   virtual const int *getRegUnitPressureSets(unsigned RegUnit) const = 0;
 
+  /// Get the factor of spill weight for this register class.
+  virtual unsigned
+  getSpillWeightScaleFactor(const TargetRegisterClass *RC) const;
+
   /// Get a list of 'hint' registers that the register allocator should try
   /// first when allocating a physical register for the virtual register
   /// VirtReg. These registers are effectively moved to the front of the
diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index 6a9dc60584a332..3bc0159fefd02b 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -188,6 +188,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
   // Do not update future local split artifacts.
   bool ShouldUpdateLI = !IsLocalSplitArtifact;
 
+  unsigned Factor = TRI.getSpillWeightScaleFactor(MRI.getRegClass(LI.reg()));
   if (IsLocalSplitArtifact) {
     MachineBasicBlock *LocalMBB = LIS.getMBBFromIndex(*End);
     assert(LocalMBB == LIS.getMBBFromIndex(*Start) &&
@@ -198,10 +199,10 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
     // localLI = COPY other
     // ...
     // other   = COPY localLI
-    TotalWeight +=
-        LiveIntervals::getSpillWeight(true, false, &MBFI, LocalMBB, PSI);
-    TotalWeight +=
-        LiveIntervals::getSpillWeight(false, true, &MBFI, LocalMBB, PSI);
+    TotalWeight += LiveIntervals::getSpillWeight(true, false, &MBFI, LocalMBB,
+                                                 Factor, PSI);
+    TotalWeight += LiveIntervals::getSpillWeight(false, true, &MBFI, LocalMBB,
+                                                 Factor, PSI);
 
     NumInstr += 2;
   }
@@ -271,7 +272,8 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
       // Calculate instr weight.
       bool Reads, Writes;
       std::tie(Reads, Writes) = MI->readsWritesVirtualRegister(LI.reg());
-      Weight = LiveIntervals::getSpillWeight(Writes, Reads, &MBFI, *MI, PSI);
+      Weight =
+          LiveIntervals::getSpillWeight(Writes, Reads, &MBFI, *MI, Factor, PSI);
 
       // Give extra weight to what looks like a loop induction variable update.
       if (Writes && IsExiting && LIS.isLiveOutOfMBB(LI, MBB))
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index f9ee6e4563f8d6..e3ecc46b10366a 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -886,22 +886,22 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
 
 float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
                                     const MachineBlockFrequencyInfo *MBFI,
-                                    const MachineInstr &MI,
+                                    const MachineInstr &MI, unsigned Factor,
                                     ProfileSummaryInfo *PSI) {
-  return getSpillWeight(isDef, isUse, MBFI, MI.getParent(), PSI);
+  return getSpillWeight(isDef, isUse, MBFI, MI.getParent(), Factor, PSI);
 }
 
 float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
                                     const MachineBlockFrequencyInfo *MBFI,
                                     const MachineBasicBlock *MBB,
-                                    ProfileSummaryInfo *PSI) {
+                                    unsigned Factor, ProfileSummaryInfo *PSI) {
   float Weight = isDef + isUse;
   const auto *MF = MBB->getParent();
   // When optimizing for size we only consider the codesize impact of spilling
   // the register, not the runtime impact.
   if (PSI && llvm::shouldOptimizeForSize(MF, PSI, MBFI))
     return Weight;
-  return Weight * MBFI->getBlockFreqRelativeToEntryBlock(MBB);
+  return Weight * MBFI->getBlockFreqRelativeToEntryBlock(MBB) * Factor;
 }
 
 LiveRange::Segment
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index 032f1a33e75c43..93c59cb134d8c0 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -414,6 +414,11 @@ bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
   return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg);
 }
 
+unsigned TargetRegisterInfo::getSpillWeightScaleFactor(
+    const TargetRegisterClass *RC) const {
+  return 1;
+}
+
 // Compute target-independent register allocator hints to help eliminate copies.
 bool TargetRegisterInfo::getRegAllocationHints(
     Register VirtReg, ArrayRef<MCPhysReg> Order,
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index cfcc3119257f65..9e02f1ecc60cde 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -805,6 +805,11 @@ RISCVRegisterInfo::getRegisterCostTableIndex(const MachineFunction &MF) const {
              : 0;
 }
 
+unsigned RISCVRegisterInfo::getSpillWeightScaleFactor(
+    const TargetRegisterClass *RC) const {
+  return getRegClassWeight(RC).RegWeight;
+}
+
 // Add two address hints to improve chances of being able to use a compressed
 // instruction.
 bool RISCVRegisterInfo::getRegAllocationHints(
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 3ab79694e175c8..9b4317873fec61 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -127,6 +127,9 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
 
   unsigned getRegisterCostTableIndex(const MachineFunction &MF) const override;
 
+  unsigned
+  getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override;
+
   bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
                              SmallVectorImpl<MCPhysReg> &Hints,
                              const MachineFunction &MF, const VirtRegMap *VRM,
diff --git a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
index 163d9145bc3623..d52217d8fe2474 100644
--- a/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/abs-vp.ll
@@ -561,18 +561,7 @@ declare <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64>, i1 immarg
 define <vscale x 16 x i64> @vp_abs_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_abs_nxv16i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
@@ -582,30 +571,16 @@ define <vscale x 16 x i64> @vp_abs_nxv16i64(<vscale x 16 x i64> %va, <vscale x 1
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v8, v16, 0, v0.t
-; CHECK-NEXT:    vmax.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vrsub.vi v24, v16, 0, v0.t
+; CHECK-NEXT:    vmax.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB46_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB46_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
-; CHECK-NEXT:    vmax.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vrsub.vi v24, v8, 0, v0.t
+; CHECK-NEXT:    vmax.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x i64> @llvm.vp.abs.nxv16i64(<vscale x 16 x i64> %va, i1 false, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
index 66a1178cddb66c..6b5f5d786b60c2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bitreverse-vp.ll
@@ -2311,35 +2311,37 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 3
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
+; RV32-NEXT:    slli a5, a5, 3
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
@@ -2353,7 +2355,7 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -2371,7 +2373,7 @@ define <vscale x 7 x i64> @vp_bitreverse_nxv7i64(<vscale x 7 x i64> %va, <vscale
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -2700,35 +2702,37 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a5, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a5) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
 ; RV32-NEXT:    slli a5, a5, 4
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
+; RV32-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a5, vlenb
+; RV32-NEXT:    slli a5, a5, 3
+; RV32-NEXT:    add a5, sp, a5
+; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    slli a5, a5, 4
+; RV32-NEXT:    slli a5, a5, 3
 ; RV32-NEXT:    add a5, sp, a5
 ; RV32-NEXT:    addi a5, a5, 16
 ; RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
@@ -2742,7 +2746,7 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -2760,7 +2764,7 @@ define <vscale x 8 x i64> @vp_bitreverse_nxv8i64(<vscale x 8 x i64> %va, <vscale
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v24, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 4
+; RV32-NEXT:    slli a1, a1, 3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -3069,18 +3073,7 @@ declare <vscale x 64 x i16> @llvm.vp.bitreverse.nxv64i16(<vscale x 64 x i16>, <v
 define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_bitreverse_nxv64i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    lui a1, 1
 ; CHECK-NEXT:    lui a2, 3
@@ -3097,63 +3090,48 @@ define <vscale x 64 x i16> @vp_bitreverse_nxv64i16(<vscale x 64 x i16> %va, <vsc
 ; CHECK-NEXT:    addi a2, a2, 819
 ; CHECK-NEXT:    addi a1, a6, 1365
 ; CHECK-NEXT:    vsetvli zero, a5, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v16, 8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; CHECK-NEXT:    vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a4, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a2, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a1, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a5, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsll.vi v16, v16, 1, v0.t
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    bltu a0, a3, .LBB46_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a3
 ; CHECK-NEXT:  .LBB46_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a4, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 2, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a2, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a1, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_bitreverse_nxv64i16:
diff --git a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
index 1c95ec8fafd4f1..ca783060cfe641 100644
--- a/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll
@@ -1048,35 +1048,37 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
@@ -1090,7 +1092,7 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -1100,7 +1102,7 @@ define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -1323,35 +1325,37 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli a3, zero, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
@@ -1365,7 +1369,7 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -1375,7 +1379,7 @@ define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -1578,18 +1582,7 @@ declare <vscale x 64 x i16> @llvm.vp.bswap.nxv64i16(<vscale x 64 x i16>, <vscale
 define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_bswap_nxv64i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 1
 ; CHECK-NEXT:    slli a1, a1, 2
@@ -1600,33 +1593,18 @@ define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a2, a3, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v8, v16, 8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vor.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB32_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB32_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVKB-LABEL: vp_bswap_nxv64i16:
diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
index 7d0b0118a72725..2ad48c3a70100c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll
@@ -273,12 +273,6 @@ declare <vscale x 32 x bfloat> @llvm.vp.ceil.nxv32bf16(<vscale x 32 x bfloat>, <
 define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv32bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -302,11 +296,7 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -332,12 +322,6 @@ define <vscale x 32 x bfloat> @vp_ceil_vv_nxv32bf16(<vscale x 32 x bfloat> %va,
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.ceil.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -840,12 +824,6 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ;
 ; ZVFHMIN-LABEL: vp_ceil_vv_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -869,11 +847,7 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -899,12 +873,6 @@ define <vscale x 32 x half> @vp_ceil_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.ceil.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1419,12 +1387,6 @@ declare <vscale x 16 x double> @llvm.vp.ceil.nxv16f64(<vscale x 16 x double>, <v
 define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_vv_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
@@ -1445,40 +1407,27 @@ define <vscale x 16 x double> @vp_ceil_vv_nxv16f64(<vscale x 16 x double> %va, <
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.ceil.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
index a407cd048ffe3f..49e54032c1de61 100644
--- a/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/compressstore.ll
@@ -224,55 +224,39 @@ define void @test_compresstore_v256i8(ptr %p, <256 x i1> %mask, <256 x i8> %data
 ;
 ; RV32-LABEL: test_compresstore_v256i8:
 ; RV32:       # %bb.0: # %entry
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT:    vmv8r.v v24, v16
+; RV32-NEXT:    vmv1r.v v7, v8
 ; RV32-NEXT:    li a2, 128
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
 ; RV32-NEXT:    vslidedown.vi v9, v0, 1
 ; RV32-NEXT:    li a3, 32
 ; RV32-NEXT:    vmv.x.s a4, v0
 ; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT:    vle8.v v16, (a1)
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vle8.v v24, (a1)
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v10, v9, a3
+; RV32-NEXT:    vsrl.vx v6, v9, a3
 ; RV32-NEXT:    vmv.x.s a1, v9
-; RV32-NEXT:    vsrl.vx v9, v0, a3
+; RV32-NEXT:    vsrl.vx v5, v0, a3
 ; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v16, v24, v0
+; RV32-NEXT:    vcompress.vm v8, v16, v0
 ; RV32-NEXT:    vcpop.m a3, v0
 ; RV32-NEXT:    cpop a4, a4
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; RV32-NEXT:    vmv.x.s a5, v10
-; RV32-NEXT:    vmv.x.s a6, v9
+; RV32-NEXT:    vmv.x.s a5, v6
+; RV32-NEXT:    vmv.x.s a6, v5
 ; RV32-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; RV32-NEXT:    vse8.v v16, (a0)
+; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    cpop a1, a1
 ; RV32-NEXT:    cpop a3, a6
 ; RV32-NEXT:    cpop a5, a5
 ; RV32-NEXT:    add a3, a4, a3
 ; RV32-NEXT:    add a1, a1, a5
 ; RV32-NEXT:    add a1, a3, a1
-; RV32-NEXT:    addi a3, sp, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; RV32-NEXT:    vcompress.vm v16, v24, v8
+; RV32-NEXT:    vcompress.vm v8, v24, v7
 ; RV32-NEXT:    add a0, a0, a1
-; RV32-NEXT:    vcpop.m a1, v8
+; RV32-NEXT:    vcpop.m a1, v7
 ; RV32-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
-; RV32-NEXT:    vse8.v v16, (a0)
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    vse8.v v8, (a0)
 ; RV32-NEXT:    ret
 entry:
   tail call void @llvm.masked.compressstore.v256i8(<256 x i8> %data, ptr align 1 %p, <256 x i1> %mask)
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 9e75dc9dccffde..d3929e936d1542 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -2024,8 +2024,7 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; RV32-NEXT:    vmv1r.v v7, v0
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -2038,88 +2037,60 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi a2, a2, 1365
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a2
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    sltu a2, a0, a3
 ; RV32-NEXT:    addi a2, a2, -1
 ; RV32-NEXT:    and a2, a2, a3
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vsub.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    lui a3, 209715
 ; RV32-NEXT:    addi a3, a3, 819
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a3
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
+; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    lui a3, 61681
 ; RV32-NEXT:    addi a3, a3, -241
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    lui a3, 4112
 ; RV32-NEXT:    addi a3, a3, 257
 ; RV32-NEXT:    vsetvli a4, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a3
+; RV32-NEXT:    vmv.v.x v8, a3
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    li a2, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
+; RV32-NEXT:    vsrl.vx v8, v16, a2, v0.t
 ; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    bltu a0, a1, .LBB46_2
@@ -2127,60 +2098,32 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:  .LBB46_2:
 ; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
-; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv8r.v v16, v8
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v24, v8, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v24, 2, v0.t
+; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
@@ -2207,28 +2150,17 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ;
 ; RV64-LABEL: vp_ctpop_nxv16i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    srli a2, a1, 3
 ; RV64-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a2
+; RV64-NEXT:    vslidedown.vx v7, v0, a2
 ; RV64-NEXT:    mv a2, a0
 ; RV64-NEXT:    bltu a0, a1, .LBB46_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a2, a1
 ; RV64-NEXT:  .LBB46_2:
 ; RV64-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    lui a3, 209715
 ; RV64-NEXT:    lui a4, 61681
@@ -2238,58 +2170,43 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    addiw a4, a4, -241
 ; RV64-NEXT:    addiw a5, a5, 257
 ; RV64-NEXT:    slli a6, a2, 32
-; RV64-NEXT:    add a6, a2, a6
-; RV64-NEXT:    slli a2, a3, 32
-; RV64-NEXT:    add a7, a3, a2
-; RV64-NEXT:    slli a2, a4, 32
-; RV64-NEXT:    add a2, a4, a2
-; RV64-NEXT:    slli a3, a5, 32
-; RV64-NEXT:    add a3, a5, a3
-; RV64-NEXT:    li a4, 56
+; RV64-NEXT:    add a2, a2, a6
+; RV64-NEXT:    slli a6, a3, 32
+; RV64-NEXT:    add a3, a3, a6
+; RV64-NEXT:    slli a6, a4, 32
+; RV64-NEXT:    add a4, a4, a6
+; RV64-NEXT:    slli a6, a5, 32
+; RV64-NEXT:    add a5, a5, a6
+; RV64-NEXT:    li a6, 56
 ; RV64-NEXT:    sub a1, a0, a1
 ; RV64-NEXT:    sltu a0, a0, a1
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a0, a0, a1
-; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a7, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a7, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a4, v0.t
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a5, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a6, v0.t
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
-; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v16, a7, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a2, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a3, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a4, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 16
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a5, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a6, v0.t
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_ctpop_nxv16i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
index 9e6295b6644171..5d2327f60534c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll
@@ -2242,11 +2242,11 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb
-; RV32-NEXT:    vmv1r.v v24, v0
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; RV32-NEXT:    vmv1r.v v7, v0
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
@@ -2264,226 +2264,115 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV32-NEXT:    lui a4, 349525
 ; RV32-NEXT:    addi a4, a4, 1365
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v8, v16, a2, v0.t
+; RV32-NEXT:    vsub.vx v24, v16, a2, v0.t
 ; RV32-NEXT:    vnot.v v16, v16, v0.t
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a5, vlenb
-; RV32-NEXT:    li a6, 48
-; RV32-NEXT:    mul a5, a5, a6
-; RV32-NEXT:    add a5, sp, a5
-; RV32-NEXT:    addi a5, a5, 16
-; RV32-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 40
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv.v.x v8, a4
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    li a5, 40
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vand.vv v24, v24, v8, v0.t
+; RV32-NEXT:    vsub.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    lui a4, 209715
 ; RV32-NEXT:    addi a4, a4, 819
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vand.vv v24, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    li a5, 24
 ; RV32-NEXT:    mul a4, a4, a5
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 48
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV32-NEXT:    vadd.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    lui a4, 61681
 ; RV32-NEXT:    addi a4, a4, -241
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a4
+; RV32-NEXT:    vmv.v.x v8, a4
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
+; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    lui a4, 4112
 ; RV32-NEXT:    addi a4, a4, 257
 ; RV32-NEXT:    vsetvli a5, zero, e32, m8, ta, ma
-; RV32-NEXT:    vmv.v.x v16, a4
+; RV32-NEXT:    vmv.v.x v8, a4
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    slli a4, a4, 3
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a3, e64, m8, ta, ma
-; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v16, v16, v8, v0.t
 ; RV32-NEXT:    li a3, 56
-; RV32-NEXT:    vsrl.vx v8, v8, a3, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 3
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
+; RV32-NEXT:    vsrl.vx v8, v16, a3, v0.t
+; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    bltu a0, a1, .LBB46_2
 ; RV32-NEXT:  # %bb.1:
 ; RV32-NEXT:    mv a0, a1
 ; RV32-NEXT:  .LBB46_2:
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV32-NEXT:    vsub.vx v24, v8, a2, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv8r.v v16, v8
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v8, v8, a3, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
@@ -2493,18 +2382,7 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ;
 ; RV64-LABEL: vp_cttz_nxv16i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    vmv1r.v v24, v0
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv1r.v v7, v0
 ; RV64-NEXT:    csrr a1, vlenb
 ; RV64-NEXT:    li a2, 1
 ; RV64-NEXT:    lui a3, 349525
@@ -2532,56 +2410,42 @@ define <vscale x 16 x i64> @vp_cttz_nxv16i64(<vscale x 16 x i64> %va, <vscale x
 ; RV64-NEXT:    and t0, a5, t0
 ; RV64-NEXT:    li a5, 56
 ; RV64-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v8, v16, a2, v0.t
+; RV64-NEXT:    vsub.vx v24, v16, a2, v0.t
 ; RV64-NEXT:    vnot.v v16, v16, v0.t
-; RV64-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a6, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
-; RV64-NEXT:    addi t0, sp, 16
-; RV64-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
+; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a7, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a6, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a4, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    bltu a0, a1, .LBB46_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    mv a0, a1
 ; RV64-NEXT:  .LBB46_2:
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a2, v0.t
+; RV64-NEXT:    vsub.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a7, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a6, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a7, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a6, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 16
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_nxv16i64:
@@ -3996,18 +3860,7 @@ define <vscale x 8 x i64> @vp_cttz_zero_undef_nxv8i64_unmasked(<vscale x 8 x i64
 define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_cttz_zero_undef_nxv16i64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    fsrmi a3, 1
 ; CHECK-NEXT:    srli a2, a1, 3
@@ -4019,40 +3872,26 @@ define <vscale x 16 x i64> @vp_cttz_zero_undef_nxv16i64(<vscale x 16 x i64> %va,
 ; CHECK-NEXT:    and a4, a2, a4
 ; CHECK-NEXT:    li a2, 52
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v8, v16, 0, v0.t
-; CHECK-NEXT:    vand.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
+; CHECK-NEXT:    vrsub.vi v24, v16, 0, v0.t
+; CHECK-NEXT:    vand.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vfcvt.f.xu.v v16, v16, v0.t
 ; CHECK-NEXT:    fsrm a3
-; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
+; CHECK-NEXT:    vsrl.vx v16, v16, a2, v0.t
 ; CHECK-NEXT:    li a3, 1023
-; CHECK-NEXT:    vsub.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    addi a4, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsub.vx v16, v16, a3, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB94_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB94_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vrsub.vi v16, v8, 0, v0.t
+; CHECK-NEXT:    vrsub.vi v24, v8, 0, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vand.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vand.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    vfcvt.f.xu.v v8, v8, v0.t
 ; CHECK-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsub.vx v8, v8, a3, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-ZVBB-LABEL: vp_cttz_zero_undef_nxv16i64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/expandload.ll b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
index b32d85bb1943a5..7b970f7152237c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/expandload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/expandload.ll
@@ -230,7 +230,7 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV32-NEXT:    vmv1r.v v7, v8
 ; CHECK-RV32-NEXT:    li a2, 128
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vslidedown.vi v9, v0, 1
+; CHECK-RV32-NEXT:    vslidedown.vi v8, v0, 1
 ; CHECK-RV32-NEXT:    li a3, 32
 ; CHECK-RV32-NEXT:    vmv.x.s a4, v0
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
@@ -241,15 +241,15 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v10, v9, a3
-; CHECK-RV32-NEXT:    vsrl.vx v11, v0, a3
-; CHECK-RV32-NEXT:    vmv.x.s a1, v9
+; CHECK-RV32-NEXT:    vsrl.vx v9, v8, a3
+; CHECK-RV32-NEXT:    vsrl.vx v10, v0, a3
+; CHECK-RV32-NEXT:    vmv.x.s a1, v8
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a3, v0
 ; CHECK-RV32-NEXT:    cpop a4, a4
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a5, v10
-; CHECK-RV32-NEXT:    vmv.x.s a6, v11
+; CHECK-RV32-NEXT:    vmv.x.s a5, v9
+; CHECK-RV32-NEXT:    vmv.x.s a6, v10
 ; CHECK-RV32-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle8.v v8, (a0)
 ; CHECK-RV32-NEXT:    csrr a3, vlenb
@@ -290,12 +290,7 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
 ; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV32-NEXT:    viota.m v16, v7
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    viota.m v8, v7
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 3
@@ -304,11 +299,6 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    li a1, 24
@@ -341,7 +331,7 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV64-NEXT:    vmv1r.v v7, v8
 ; CHECK-RV64-NEXT:    li a2, 128
 ; CHECK-RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vslidedown.vi v9, v0, 1
+; CHECK-RV64-NEXT:    vslidedown.vi v8, v0, 1
 ; CHECK-RV64-NEXT:    vmv.x.s a3, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-RV64-NEXT:    vle8.v v16, (a1)
@@ -351,7 +341,7 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV64-NEXT:    addi a1, a1, 16
 ; CHECK-RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
-; CHECK-RV64-NEXT:    vmv.x.s a1, v9
+; CHECK-RV64-NEXT:    vmv.x.s a1, v8
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-RV64-NEXT:    vcpop.m a4, v0
 ; CHECK-RV64-NEXT:    vsetvli zero, a4, e8, m8, ta, ma
@@ -372,31 +362,26 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-RV64-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
-; CHECK-RV64-NEXT:    viota.m v24, v0
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    li a1, 24
-; CHECK-RV64-NEXT:    mul a0, a0, a1
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    viota.m v16, v0
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 4
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    vrgather.vv v8, v16, v24, v0.t
+; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    li a1, 24
 ; CHECK-RV64-NEXT:    mul a0, a0, a1
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-RV64-NEXT:    viota.m v16, v7
+; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-RV64-NEXT:    vrgather.vv v8, v24, v16, v0.t
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
+; CHECK-RV64-NEXT:    li a1, 24
+; CHECK-RV64-NEXT:    mul a0, a0, a1
 ; CHECK-RV64-NEXT:    add a0, sp, a0
 ; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-RV64-NEXT:    viota.m v8, v7
 ; CHECK-RV64-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    slli a0, a0, 3
@@ -405,11 +390,6 @@ define <256 x i8> @test_expandload_v256i8(ptr %base, <256 x i1> %mask, <256 x i8
 ; CHECK-RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    addi a0, sp, 16
 ; CHECK-RV64-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV64-NEXT:    csrr a0, vlenb
-; CHECK-RV64-NEXT:    slli a0, a0, 4
-; CHECK-RV64-NEXT:    add a0, sp, a0
-; CHECK-RV64-NEXT:    addi a0, a0, 16
-; CHECK-RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-RV64-NEXT:    vrgather.vv v16, v24, v8, v0.t
 ; CHECK-RV64-NEXT:    csrr a0, vlenb
 ; CHECK-RV64-NEXT:    li a1, 24
@@ -664,12 +644,12 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV32-NEXT:    addi sp, sp, -16
 ; CHECK-RV32-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-RV32-NEXT:    csrr a1, vlenb
-; CHECK-RV32-NEXT:    slli a1, a1, 5
-; CHECK-RV32-NEXT:    sub sp, sp, a1
-; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-RV32-NEXT:    csrr a1, vlenb
 ; CHECK-RV32-NEXT:    li a2, 24
 ; CHECK-RV32-NEXT:    mul a1, a1, a2
+; CHECK-RV32-NEXT:    sub sp, sp, a1
+; CHECK-RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-RV32-NEXT:    csrr a1, vlenb
+; CHECK-RV32-NEXT:    slli a1, a1, 4
 ; CHECK-RV32-NEXT:    add a1, sp, a1
 ; CHECK-RV32-NEXT:    addi a1, a1, 16
 ; CHECK-RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -682,58 +662,49 @@ define <128 x i16> @test_expandload_v128i16(ptr %base, <128 x i1> %mask, <128 x
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a4, v0
 ; CHECK-RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vsrl.vx v25, v0, a2
+; CHECK-RV32-NEXT:    vsrl.vx v6, v0, a2
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
 ; CHECK-RV32-NEXT:    vcpop.m a2, v7
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
 ; CHECK-RV32-NEXT:    vle16.v v16, (a0)
-; CHECK-RV32-NEXT:    csrr a5, vlenb
-; CHECK-RV32-NEXT:    slli a5, a5, 4
-; CHECK-RV32-NEXT:    add a5, sp, a5
-; CHECK-RV32-NEXT:    addi a5, a5, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetvli zero, a4, e64, m1, ta, ma
-; CHECK-RV32-NEXT:    vmv.x.s a4, v25
+; CHECK-RV32-NEXT:    vmv.x.s a4, v6
 ; CHECK-RV32-NEXT:    cpop a4, a4
 ; CHECK-RV32-NEXT:    cpop a3, a3
 ; CHECK-RV32-NEXT:    add a3, a3, a4
 ; CHECK-RV32-NEXT:    slli a3, a3, 1
 ; CHECK-RV32-NEXT:    add a0, a0, a3
 ; CHECK-RV32-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-RV32-NEXT:    vle16.v v16, (a0)
+; CHECK-RV32-NEXT:    vle16.v v24, (a0)
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    vsetvli zero, a1, e16, m8, ta, mu
-; CHECK-RV32-NEXT:    viota.m v16, v0
-; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 4
-; CHECK-RV32-NEXT:    add a0, sp, a0
-; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    vrgather.vv v8, v24, v16, v0.t
+; CHECK-RV32-NEXT:    viota.m v24, v0
+; CHECK-RV32-NEXT:    vrgather.vv v8, v16, v24, v0.t
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-RV32-NEXT:    viota.m v8, v7
 ; CHECK-RV32-NEXT:    vmv1r.v v0, v7
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    li a1, 24
-; CHECK-RV32-NEXT:    mul a0, a0, a1
+; CHECK-RV32-NEXT:    slli a0, a0, 4
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
 ; CHECK-RV32-NEXT:    slli a0, a0, 3
 ; CHECK-RV32-NEXT:    add a0, sp, a0
 ; CHECK-RV32-NEXT:    addi a0, a0, 16
-; CHECK-RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-RV32-NEXT:    vrgather.vv v16, v24, v8, v0.t
+; CHECK-RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT:    vrgather.vv v24, v16, v8, v0.t
 ; CHECK-RV32-NEXT:    addi a0, sp, 16
 ; CHECK-RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-RV32-NEXT:    vmv.v.v v16, v24
 ; CHECK-RV32-NEXT:    csrr a0, vlenb
-; CHECK-RV32-NEXT:    slli a0, a0, 5
+; CHECK-RV32-NEXT:    li a1, 24
+; CHECK-RV32-NEXT:    mul a0, a0, a1
 ; CHECK-RV32-NEXT:    add sp, sp, a0
 ; CHECK-RV32-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-RV32-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
index 3eb5d36b4896a7..9305879147d320 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll
@@ -1659,7 +1659,6 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
-; RV32-NEXT:    vmv8r.v v24, v8
 ; RV32-NEXT:    lui a2, 1044480
 ; RV32-NEXT:    lui a3, 61681
 ; RV32-NEXT:    lui a4, 209715
@@ -1682,58 +1681,60 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV32-NEXT:    addi a5, a6, -256
 ; RV32-NEXT:    sw a4, 24(sp)
 ; RV32-NEXT:    sw a4, 28(sp)
-; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV32-NEXT:    vsll.vx v8, v8, a2, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    slli a4, a4, 3
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v16, v24, a3, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v24, a1, v0.t
-; RV32-NEXT:    vsrl.vx v8, v24, a2, v0.t
-; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a2, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v24, 24, v0.t
-; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vsrl.vi v8, v24, 8, v0.t
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -1743,7 +1744,7 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
@@ -2055,7 +2056,6 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
 ; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 24 * vlenb
-; RV32-NEXT:    vmv8r.v v24, v8
 ; RV32-NEXT:    lui a2, 1044480
 ; RV32-NEXT:    lui a3, 61681
 ; RV32-NEXT:    lui a4, 209715
@@ -2078,58 +2078,60 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV32-NEXT:    addi a5, a6, -256
 ; RV32-NEXT:    sw a4, 24(sp)
 ; RV32-NEXT:    sw a4, 28(sp)
-; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV32-NEXT:    vsll.vx v8, v8, a2, v0.t
-; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vand.vx v24, v8, a5, v0.t
+; RV32-NEXT:    vsll.vx v24, v24, a2, v0.t
+; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    slli a4, a4, 3
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v16, v24, a3, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 24, v0.t
-; RV32-NEXT:    addi a4, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
 ; RV32-NEXT:    slli a4, a4, 4
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    slli a4, a4, 4
+; RV32-NEXT:    slli a4, a4, 3
+; RV32-NEXT:    add a4, sp, a4
+; RV32-NEXT:    addi a4, a4, 48
+; RV32-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a4, vlenb
+; RV32-NEXT:    slli a4, a4, 3
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 48
 ; RV32-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vx v16, v24, a1, v0.t
-; RV32-NEXT:    vsrl.vx v8, v24, a2, v0.t
-; RV32-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
+; RV32-NEXT:    vsrl.vx v24, v8, a2, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v24, 24, v0.t
-; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
-; RV32-NEXT:    vsrl.vi v8, v24, 8, v0.t
+; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
+; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 3
+; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    addi a1, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
@@ -2139,7 +2141,7 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex
 ; RV32-NEXT:    vlse64.v v24, (a1), zero
 ; RV32-NEXT:    addi a1, sp, 24
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
@@ -2444,27 +2446,16 @@ declare <128 x i16> @llvm.vp.bitreverse.v128i16(<128 x i16>, <128 x i1>, i32)
 define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_bitreverse_v128i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 64
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 8
+; CHECK-NEXT:    vslidedown.vi v7, v0, 8
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB34_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:  .LBB34_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    lui a1, 1
 ; CHECK-NEXT:    lui a2, 3
 ; CHECK-NEXT:    addi a3, a0, -64
@@ -2473,60 +2464,45 @@ define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    and a3, a0, a3
 ; CHECK-NEXT:    lui a0, 5
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    addi a4, a1, -241
-; CHECK-NEXT:    addi a1, a2, 819
+; CHECK-NEXT:    addi a1, a1, -241
+; CHECK-NEXT:    addi a2, a2, 819
 ; CHECK-NEXT:    addi a0, a0, 1365
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
+; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a1, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 2, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a2, v0.t
+; CHECK-NEXT:    vand.vx v8, v8, a2, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
 ; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; CHECK-NEXT:    vor.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a4, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a4, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 4, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 2, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 8, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
+; CHECK-NEXT:    vor.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a1, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a1, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 2, v0.t
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 4, v0.t
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 2, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a2, v0.t
+; CHECK-NEXT:    vand.vx v16, v16, a2, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 2, v0.t
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a0, v0.t
 ; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a0, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vor.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vsll.vi v16, v16, 1, v0.t
+; CHECK-NEXT:    vor.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <128 x i16> @llvm.vp.bitreverse.v128i16(<128 x i16> %va, <128 x i1> %m, i32 %evl)
   ret <128 x i16> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
index d765e4c0b8f6a9..5f382c3fdc834e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll
@@ -782,35 +782,37 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
@@ -824,7 +826,7 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -834,7 +836,7 @@ define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -1045,35 +1047,37 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
 ; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 4
+; RV32-NEXT:    slli a3, a3, 3
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a5), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a3, 4080
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
-; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
-; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vand.vx v16, v8, a3, v0.t
+; RV32-NEXT:    vsll.vi v24, v16, 24, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
+; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
@@ -1087,7 +1091,7 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
+; RV32-NEXT:    slli a0, a0, 4
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -1097,7 +1101,7 @@ define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
+; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 16
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -1287,53 +1291,27 @@ declare <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16>, <128 x i1>, i32)
 define <128 x i16> @vp_bswap_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_bswap_v128i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a2, 64
 ; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 8
+; CHECK-NEXT:    vslidedown.vi v7, v0, 8
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 64
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
+; CHECK-NEXT:    vsrl.vi v24, v8, 8, v0.t
 ; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    addi a1, a0, -64
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
-; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vsrl.vi v24, v16, 8, v0.t
+; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
+; CHECK-NEXT:    vor.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    ret
   %v = call <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16> %va, <128 x i1> %m, i32 %evl)
   ret <128 x i16> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
index 511242aa677c2a..4b85a95f633ba4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll
@@ -737,29 +737,18 @@ declare <32 x double> @llvm.vp.ceil.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_ceil_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v6, v0
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    addi a1, a0, -16
@@ -767,43 +756,28 @@ define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 3
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 3
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.ceil.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
index 9d0d42cf754c5e..4f7f5b83ae3902 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll
@@ -2050,17 +2050,17 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
@@ -2108,129 +2108,57 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a4), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vlse64.v v16, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a3, a3, a5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a4), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a4), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
@@ -2242,7 +2170,7 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
@@ -2262,86 +2190,34 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
@@ -2351,7 +2227,7 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
@@ -2361,27 +2237,16 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ;
 ; RV64-LABEL: vp_ctlz_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; RV64-NEXT:    mv a1, a0
 ; RV64-NEXT:    bltu a0, a2, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB34_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    li a1, 32
 ; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    lui a3, 209715
@@ -2404,72 +2269,57 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a6, a0, a6
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 2, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 8, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 16, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a1, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a5, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    slli a7, a7, 3
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vor.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 16
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 8, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 16, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vnot.v v16, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a5, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a4, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 false, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
@@ -4756,17 +4606,17 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi sp, sp, -48
 ; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 56
+; RV32-NEXT:    li a2, 48
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    sub sp, sp, a1
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 56 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 48 * vlenb
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a1, a1, 4
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v0, 2
+; RV32-NEXT:    vslidedown.vi v7, v0, 2
 ; RV32-NEXT:    lui a1, 349525
 ; RV32-NEXT:    lui a2, 209715
 ; RV32-NEXT:    addi a1, a1, 1365
@@ -4814,129 +4664,57 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vnot.v v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a4), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
+; RV32-NEXT:    vlse64.v v16, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 40
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
 ; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 48
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
+; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a3, a3, a5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a4), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a4), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v16, v8, v16, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
@@ -4948,7 +4726,7 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    sltu a0, a0, a3
 ; RV32-NEXT:    addi a0, a0, -1
 ; RV32-NEXT:    and a0, a0, a3
-; RV32-NEXT:    vmv1r.v v0, v24
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    slli a3, a3, 4
 ; RV32-NEXT:    add a3, sp, a3
@@ -4968,86 +4746,34 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 48
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 4
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
+; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
@@ -5057,7 +4783,7 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 56
+; RV32-NEXT:    li a1, 48
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 48
@@ -5067,27 +4793,16 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ;
 ; RV64-LABEL: vp_ctlz_zero_undef_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; RV64-NEXT:    mv a1, a0
 ; RV64-NEXT:    bltu a0, a2, .LBB70_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB70_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    li a1, 32
 ; RV64-NEXT:    lui a2, 349525
 ; RV64-NEXT:    lui a3, 209715
@@ -5110,72 +4825,57 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    addi a0, a0, -1
 ; RV64-NEXT:    and a6, a0, a6
 ; RV64-NEXT:    li a0, 56
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 2, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 8, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 16, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vx v24, v8, a1, v0.t
+; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a5, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a4, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    addi a7, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a7) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a7, vlenb
-; RV64-NEXT:    slli a7, a7, 3
-; RV64-NEXT:    add a7, sp, a7
-; RV64-NEXT:    addi a7, a7, 16
-; RV64-NEXT:    vl8r.v v8, (a7) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vor.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v8, v16, 2, v0.t
-; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 8, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 16, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a4, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a4, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 16
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 2, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 8, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 16, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vnot.v v16, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a5, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a4, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a4, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctlz.v32i64(<32 x i64> %va, i1 true, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
index 5e73e6df9170c2..82f6deb85c44d6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll
@@ -1565,75 +1565,53 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a2, sp, 32
 ; RV32-NEXT:    vlse64.v v16, (a2), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
+; RV32-NEXT:    li a3, 24
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 40
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 5
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
-; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    li a3, 24
 ; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a2, sp, 24
 ; RV32-NEXT:    addi a3, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a2), zero
-; RV32-NEXT:    addi a2, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a3), zero
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a3), zero
+; RV32-NEXT:    addi a2, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a1, a1, a2
+; RV32-NEXT:    slli a1, a1, 5
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    li a1, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a1, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
@@ -1652,35 +1630,35 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV32-NEXT:    addi a2, a2, 48
 ; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vsrl.vi v8, v16, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a2, 40
 ; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    vsub.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a2, 24
+; RV32-NEXT:    mul a0, a0, a2
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
 ; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
 ; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    addi a0, sp, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a2, 24
-; RV32-NEXT:    mul a0, a0, a2
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    addi a0, sp, 48
+; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
@@ -1699,27 +1677,16 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ;
 ; RV64-LABEL: vp_ctpop_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; RV64-NEXT:    mv a1, a0
 ; RV64-NEXT:    bltu a0, a2, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:  .LBB34_2:
 ; RV64-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
 ; RV64-NEXT:    lui a1, 349525
 ; RV64-NEXT:    lui a2, 209715
 ; RV64-NEXT:    lui a3, 61681
@@ -1729,58 +1696,43 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev
 ; RV64-NEXT:    addiw a3, a3, -241
 ; RV64-NEXT:    addiw a4, a4, 257
 ; RV64-NEXT:    slli a5, a1, 32
-; RV64-NEXT:    add a5, a1, a5
-; RV64-NEXT:    slli a1, a2, 32
-; RV64-NEXT:    add a6, a2, a1
-; RV64-NEXT:    slli a1, a3, 32
-; RV64-NEXT:    add a1, a3, a1
-; RV64-NEXT:    slli a2, a4, 32
-; RV64-NEXT:    add a2, a4, a2
-; RV64-NEXT:    addi a3, a0, -16
-; RV64-NEXT:    sltu a0, a0, a3
+; RV64-NEXT:    add a1, a1, a5
+; RV64-NEXT:    slli a5, a2, 32
+; RV64-NEXT:    add a2, a2, a5
+; RV64-NEXT:    slli a5, a3, 32
+; RV64-NEXT:    add a3, a3, a5
+; RV64-NEXT:    slli a5, a4, 32
+; RV64-NEXT:    add a4, a4, a5
+; RV64-NEXT:    addi a5, a0, -16
+; RV64-NEXT:    sltu a0, a0, a5
 ; RV64-NEXT:    addi a0, a0, -1
-; RV64-NEXT:    and a0, a0, a3
-; RV64-NEXT:    li a3, 56
-; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a6, v0.t
+; RV64-NEXT:    and a0, a0, a5
+; RV64-NEXT:    li a5, 56
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a6, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v8, v8, a3, v0.t
-; RV64-NEXT:    addi a4, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 16
-; RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vand.vx v8, v8, a3, v0.t
+; RV64-NEXT:    vmul.vx v8, v8, a4, v0.t
+; RV64-NEXT:    vsrl.vx v8, v8, a5, v0.t
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
-; RV64-NEXT:    vsub.vv v16, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v16, a6, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a1, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a2, v0.t
 ; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a1, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 16
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a4, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a5, v0.t
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.ctpop.v32i64(<32 x i64> %va, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
index cd4b19f11d1602..13fcaa3654f0c4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll
@@ -1769,99 +1769,64 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a4), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
@@ -1883,52 +1848,35 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
 ; RV32-NEXT:    vnot.v v16, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
-; RV32-NEXT:    vsub.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v24, 2, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
@@ -1947,20 +1895,9 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ;
 ; RV64-LABEL: vp_cttz_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; RV64-NEXT:    mv a4, a0
 ; RV64-NEXT:    bltu a0, a1, .LBB34_2
 ; RV64-NEXT:  # %bb.1:
@@ -1989,53 +1926,38 @@ define <32 x i64> @vp_cttz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl
 ; RV64-NEXT:    and a7, a0, a7
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    vsub.vx v24, v8, a1, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a5, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    addi a4, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 16
-; RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 16
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    vsub.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vnot.v v16, v16, v0.t
+; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a5, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 false, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
@@ -3955,99 +3877,64 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsub.vx v16, v8, a1, v0.t
 ; RV32-NEXT:    vnot.v v8, v8, v0.t
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a5, 24
-; RV32-NEXT:    mul a4, a4, a5
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 48
-; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v8, (a3), zero
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a3, sp, 32
-; RV32-NEXT:    vlse64.v v8, (a3), zero
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 5
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v16, (a3), zero
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 40
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v24, v24, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vsub.vv v24, v16, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v16, v24, 2, v0.t
-; RV32-NEXT:    vand.vv v16, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
 ; RV32-NEXT:    li a4, 24
 ; RV32-NEXT:    mul a3, a3, a4
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
 ; RV32-NEXT:    addi a3, sp, 24
 ; RV32-NEXT:    addi a4, sp, 16
 ; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; RV32-NEXT:    vlse64.v v16, (a3), zero
-; RV32-NEXT:    addi a3, sp, 48
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vlse64.v v8, (a4), zero
 ; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 24
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    slli a3, a3, 3
+; RV32-NEXT:    slli a3, a3, 5
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 48
-; RV32-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vlse64.v v24, (a4), zero
+; RV32-NEXT:    addi a3, sp, 48
+; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT:    vsrl.vi v24, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v24, v0.t
-; RV32-NEXT:    vand.vv v16, v8, v16, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    li a3, 24
-; RV32-NEXT:    mul a2, a2, a3
+; RV32-NEXT:    slli a2, a2, 5
 ; RV32-NEXT:    add a2, sp, a2
 ; RV32-NEXT:    addi a2, a2, 48
-; RV32-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV32-NEXT:    vmul.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
+; RV32-NEXT:    vmul.vv v8, v8, v24, v0.t
 ; RV32-NEXT:    li a2, 56
 ; RV32-NEXT:    vsrl.vx v8, v8, a2, v0.t
 ; RV32-NEXT:    csrr a3, vlenb
@@ -4069,52 +3956,35 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV32-NEXT:    vsub.vx v8, v16, a1, v0.t
 ; RV32-NEXT:    vnot.v v16, v16, v0.t
 ; RV32-NEXT:    vand.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 1, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a1, 40
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v16, v0.t
-; RV32-NEXT:    vsub.vv v24, v8, v16, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV32-NEXT:    vsub.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v16, v24, v8, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
+; RV32-NEXT:    li a1, 24
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; RV32-NEXT:    vsrl.vi v8, v24, 2, v0.t
+; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT:    vand.vv v16, v8, v24, v0.t
+; RV32-NEXT:    vsrl.vi v8, v8, 2, v0.t
+; RV32-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
+; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
+; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 40
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV32-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV32-NEXT:    vadd.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    addi a0, sp, 48
 ; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 24
-; RV32-NEXT:    mul a0, a0, a1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 48
-; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV32-NEXT:    vmul.vv v8, v8, v16, v0.t
 ; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
 ; RV32-NEXT:    csrr a0, vlenb
@@ -4133,20 +4003,9 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ;
 ; RV64-LABEL: vp_cttz_zero_undef_v32i64:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -16
-; RV64-NEXT:    .cfi_def_cfa_offset 16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 4
-; RV64-NEXT:    sub sp, sp, a1
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 3
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a1, 16
 ; RV64-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vi v24, v0, 2
+; RV64-NEXT:    vslidedown.vi v7, v0, 2
 ; RV64-NEXT:    mv a4, a0
 ; RV64-NEXT:    bltu a0, a1, .LBB70_2
 ; RV64-NEXT:  # %bb.1:
@@ -4175,53 +4034,38 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z
 ; RV64-NEXT:    and a7, a0, a7
 ; RV64-NEXT:    li a0, 56
 ; RV64-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
+; RV64-NEXT:    vsub.vx v24, v8, a1, v0.t
 ; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
+; RV64-NEXT:    vand.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
+; RV64-NEXT:    vsub.vv v8, v8, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v8, a5, v0.t
 ; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
+; RV64-NEXT:    vadd.vv v8, v24, v8, v0.t
+; RV64-NEXT:    vsrl.vi v24, v8, 4, v0.t
+; RV64-NEXT:    vadd.vv v8, v8, v24, v0.t
 ; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
 ; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
 ; RV64-NEXT:    vsrl.vx v8, v8, a0, v0.t
-; RV64-NEXT:    addi a4, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    slli a4, a4, 3
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 16
-; RV64-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
-; RV64-NEXT:    vsub.vx v16, v8, a1, v0.t
-; RV64-NEXT:    vnot.v v8, v8, v0.t
-; RV64-NEXT:    vand.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 1, v0.t
-; RV64-NEXT:    vand.vx v16, v16, a6, v0.t
-; RV64-NEXT:    vsub.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v16, v8, a5, v0.t
-; RV64-NEXT:    vsrl.vi v8, v8, 2, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a5, v0.t
-; RV64-NEXT:    vadd.vv v8, v16, v8, v0.t
-; RV64-NEXT:    vsrl.vi v16, v8, 4, v0.t
-; RV64-NEXT:    vadd.vv v8, v8, v16, v0.t
-; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
-; RV64-NEXT:    vmul.vx v8, v8, a3, v0.t
-; RV64-NEXT:    vsrl.vx v16, v8, a0, v0.t
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 16
-; RV64-NEXT:    addi sp, sp, 16
-; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    vsub.vx v24, v16, a1, v0.t
+; RV64-NEXT:    vnot.v v16, v16, v0.t
+; RV64-NEXT:    vand.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 1, v0.t
+; RV64-NEXT:    vand.vx v24, v24, a6, v0.t
+; RV64-NEXT:    vsub.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v24, v16, a5, v0.t
+; RV64-NEXT:    vsrl.vi v16, v16, 2, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a5, v0.t
+; RV64-NEXT:    vadd.vv v16, v24, v16, v0.t
+; RV64-NEXT:    vsrl.vi v24, v16, 4, v0.t
+; RV64-NEXT:    vadd.vv v16, v16, v24, v0.t
+; RV64-NEXT:    vand.vx v16, v16, a2, v0.t
+; RV64-NEXT:    vmul.vx v16, v16, a3, v0.t
+; RV64-NEXT:    vsrl.vx v16, v16, a0, v0.t
 ; RV64-NEXT:    ret
   %v = call <32 x i64> @llvm.vp.cttz.v32i64(<32 x i64> %va, i1 true, <32 x i1> %m, i32 %evl)
   ret <32 x i64> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
index 02e99ea513e69b..97d544980bc9eb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll
@@ -737,29 +737,18 @@ declare <32 x double> @llvm.vp.floor.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v6, v0
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    addi a1, a0, -16
@@ -767,43 +756,28 @@ define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 2
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.floor.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
index f43934afc370df..617e226df92986 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
@@ -595,23 +595,18 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v6, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a3, 24
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -624,43 +619,29 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB24_2:
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmfeq.vv v26, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmax.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -670,7 +651,8 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -678,7 +660,7 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -688,13 +670,12 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
index 7067cc21ab56d5..cef45258c61df8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
@@ -595,23 +595,18 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v6, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    li a3, 24
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -624,43 +619,29 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB24_2:
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v0, v6
+; CHECK-NEXT:    vmfeq.vv v26, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v26
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmin.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -670,7 +651,8 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    and a0, a1, a0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
@@ -678,7 +660,7 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    vmfeq.vv v25, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
@@ -688,13 +670,12 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vmfeq.vv v25, v8, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v25
-; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
index a25014295f9e88..ea2809aa95dade 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-explodevector.ll
@@ -869,9 +869,8 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    .cfi_offset s10, -44
 ; RV32-NEXT:    .cfi_offset s11, -48
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
 ; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 8 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v24, v8, 2
 ; RV32-NEXT:    li a0, 32
@@ -913,8 +912,6 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    vsrl.vx v16, v16, a0
 ; RV32-NEXT:    vmv.x.s s3, v16
 ; RV32-NEXT:    vslidedown.vi v16, v8, 13
-; RV32-NEXT:    addi s4, sp, 16
-; RV32-NEXT:    vs8r.v v16, (s4) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv.x.s s4, v24
 ; RV32-NEXT:    vsrl.vx v24, v24, a0
 ; RV32-NEXT:    vmv.x.s s5, v24
@@ -924,19 +921,21 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    vmv.x.s s7, v0
 ; RV32-NEXT:    vmv.s.x v7, zero
 ; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; RV32-NEXT:    vredxor.vs v16, v8, v7
+; RV32-NEXT:    vredxor.vs v7, v8, v7
+; RV32-NEXT:    addi s8, sp, 16
+; RV32-NEXT:    vs1r.v v7, (s8) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vslidedown.vi v8, v8, 15
-; RV32-NEXT:    addi s8, sp, 16
-; RV32-NEXT:    vl8r.v v0, (s8) # Unknown-size Folded Reload
-; RV32-NEXT:    vmv.x.s s8, v0
-; RV32-NEXT:    vsrl.vx v0, v0, a0
-; RV32-NEXT:    vmv.x.s s9, v0
+; RV32-NEXT:    vmv.x.s s8, v16
+; RV32-NEXT:    vsrl.vx v16, v16, a0
+; RV32-NEXT:    vmv.x.s s9, v16
 ; RV32-NEXT:    vsrl.vx v0, v24, a0
+; RV32-NEXT:    addi s10, sp, 16
+; RV32-NEXT:    vl1r.v v17, (s10) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT:    vsrl.vx v17, v16, a0
-; RV32-NEXT:    vmv.x.s s10, v16
-; RV32-NEXT:    vmv.x.s s11, v17
+; RV32-NEXT:    vsrl.vx v16, v17, a0
+; RV32-NEXT:    vmv.x.s s10, v17
+; RV32-NEXT:    vmv.x.s s11, v16
 ; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
 ; RV32-NEXT:    vsrl.vx v16, v8, a0
 ; RV32-NEXT:    add a2, s11, a2
@@ -1000,7 +999,6 @@ define i64 @explode_16xi64(<16 x i64> %v) {
 ; RV32-NEXT:    sltu a2, a0, a2
 ; RV32-NEXT:    add a1, a1, a2
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add sp, sp, a2
 ; RV32-NEXT:    .cfi_def_cfa sp, 64
 ; RV32-NEXT:    lw s0, 60(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 651674ee9a5022..a49da09426d11a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -183,59 +183,72 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi sp, sp, -16
 ; RV32-NEXT:    .cfi_def_cfa_offset 16
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a3, a2, 6
-; RV32-NEXT:    add a2, a3, a2
+; RV32-NEXT:    li a3, 61
+; RV32-NEXT:    mul a2, a2, a3
 ; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc1, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 65 * vlenb
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 61 * vlenb
 ; RV32-NEXT:    addi a3, a1, 256
 ; RV32-NEXT:    addi a4, a1, 128
 ; RV32-NEXT:    li a2, 32
 ; RV32-NEXT:    lui a5, 12291
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle32.v v24, (a1)
+; RV32-NEXT:    vle32.v v0, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a6, 41
+; RV32-NEXT:    li a6, 45
 ; RV32-NEXT:    mul a1, a1, a6
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_0)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_0)
-; RV32-NEXT:    vle16.v v4, (a1)
+; RV32-NEXT:    vle16.v v8, (a1)
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a6, 37
+; RV32-NEXT:    mul a1, a1, a6
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 1
 ; RV32-NEXT:    addi a5, a5, 3
 ; RV32-NEXT:    vle32.v v8, (a4)
 ; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    li a6, 57
+; RV32-NEXT:    li a6, 53
 ; RV32-NEXT:    mul a4, a4, a6
 ; RV32-NEXT:    add a4, sp, a4
 ; RV32-NEXT:    addi a4, a4, 16
 ; RV32-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; RV32-NEXT:    addi a1, a1, -64
 ; RV32-NEXT:    vle32.v v16, (a3)
-; RV32-NEXT:    vmv.s.x v3, a5
-; RV32-NEXT:    vmv.s.x v0, a1
+; RV32-NEXT:    csrr a3, vlenb
+; RV32-NEXT:    li a4, 29
+; RV32-NEXT:    mul a3, a3, a4
+; RV32-NEXT:    add a3, sp, a3
+; RV32-NEXT:    addi a3, a3, 16
+; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv.s.x v28, a5
+; RV32-NEXT:    vmv.s.x v24, a1
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 13
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vcompress.vm v8, v24, v3
+; RV32-NEXT:    vs1r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vcompress.vm v8, v0, v28
+; RV32-NEXT:    vmv1r.v v0, v24
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vrgatherei16.vv v8, v24, v4, v0.t
 ; RV32-NEXT:    lui a1, 12
-; RV32-NEXT:    csrr a3, vlenb
-; RV32-NEXT:    li a4, 49
-; RV32-NEXT:    mul a3, a3, a4
-; RV32-NEXT:    add a3, sp, a3
-; RV32-NEXT:    addi a3, a3, 16
-; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vslideup.vi v12, v16, 4
 ; RV32-NEXT:    csrr a3, vlenb
@@ -245,22 +258,22 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs4r.v v12, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT:    vmv.s.x v0, a1
-; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 25
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
-; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vslideup.vi v12, v24, 10, v0.t
+; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
+; RV32-NEXT:    vslidedown.vi v24, v16, 16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
+; RV32-NEXT:    vslideup.vi v12, v24, 10, v0.t
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v12, v8
 ; RV32-NEXT:    csrr a1, vlenb
@@ -273,16 +286,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    lui a3, %hi(.LCPI8_1)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI8_1)
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle16.v v28, (a3)
+; RV32-NEXT:    vle16.v v20, (a3)
 ; RV32-NEXT:    addi a1, a1, 12
-; RV32-NEXT:    vmv.s.x v20, a1
+; RV32-NEXT:    vmv.s.x v24, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 41
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vcompress.vm v8, v0, v20
+; RV32-NEXT:    vcompress.vm v8, v0, v24
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 13
 ; RV32-NEXT:    mul a1, a1, a3
@@ -290,14 +303,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v28, v0.t
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 29
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -310,6 +323,12 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    mul a1, a1, a3
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 16
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vslideup.vi v12, v24, 8, v0.t
 ; RV32-NEXT:    vsetivli zero, 12, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v12, v8
@@ -326,13 +345,9 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    lui a5, %hi(.LCPI8_3)
 ; RV32-NEXT:    addi a5, a5, %lo(.LCPI8_3)
 ; RV32-NEXT:    addi a1, a1, 48
-; RV32-NEXT:    vmv.s.x v0, a4
-; RV32-NEXT:    csrr a4, vlenb
-; RV32-NEXT:    add a4, sp, a4
-; RV32-NEXT:    addi a4, a4, 16
-; RV32-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
+; RV32-NEXT:    vmv.s.x v16, a4
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT:    vle16.v v4, (a3)
+; RV32-NEXT:    vle16.v v20, (a3)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vle16.v v8, (a5)
 ; RV32-NEXT:    csrr a3, vlenb
@@ -341,49 +356,52 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    add a3, sp, a3
 ; RV32-NEXT:    addi a3, a3, 16
 ; RV32-NEXT:    vs2r.v v8, (a3) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv.s.x v22, a1
+; RV32-NEXT:    vmv.s.x v26, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 41
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vcompress.vm v8, v24, v22
+; RV32-NEXT:    vcompress.vm v8, v0, v26
+; RV32-NEXT:    vmv1r.v v0, v16
+; RV32-NEXT:    vmv1r.v v6, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 29
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    slli a3, a1, 3
 ; RV32-NEXT:    add a1, a3, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl2r.v v16, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl2r.v v20, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v12, v0, v16
+; RV32-NEXT:    vrgatherei16.vv v12, v16, v20
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    li a3, 25
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl1r.v v7, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v12, v16, 6, v0.t
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vslideup.vi v12, v24, 6, v0.t
 ; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v12, v8
 ; RV32-NEXT:    csrr a1, vlenb
@@ -403,14 +421,17 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vle16.v v12, (a4)
 ; RV32-NEXT:    vmv.s.x v14, a1
-; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vcompress.vm v16, v24, v14
 ; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a3, 45
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
+; RV32-NEXT:    vcompress.vm v16, v24, v14
+; RV32-NEXT:    vmv1r.v v0, v6
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
@@ -421,78 +442,73 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 29
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v4, v0, v12
+; RV32-NEXT:    vrgatherei16.vv v20, v16, v12
+; RV32-NEXT:    vmv1r.v v0, v7
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
+; RV32-NEXT:    li a3, 37
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vslideup.vi v20, v8, 4, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 25
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v4, v8, 4, v0.t
+; RV32-NEXT:    vs4r.v v20, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    lui a1, 768
 ; RV32-NEXT:    lui a3, %hi(.LCPI8_6)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI8_6)
 ; RV32-NEXT:    li a4, 1008
 ; RV32-NEXT:    addi a1, a1, 768
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vle16.v v8, (a3)
+; RV32-NEXT:    vle16.v v20, (a3)
 ; RV32-NEXT:    vmv.s.x v1, a4
-; RV32-NEXT:    vmv.s.x v12, a1
+; RV32-NEXT:    vmv.s.x v2, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 41
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vcompress.vm v24, v16, v12
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vcompress.vm v8, v24, v2
 ; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 57
+; RV32-NEXT:    li a3, 53
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v24, v16, v8, v0.t
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 25
-; RV32-NEXT:    mul a1, a1, a3
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vrgatherei16.vv v8, v24, v20, v0.t
 ; RV32-NEXT:    lui a1, %hi(.LCPI8_7)
 ; RV32-NEXT:    addi a1, a1, %lo(.LCPI8_7)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vle16.v v8, (a1)
+; RV32-NEXT:    vle16.v v20, (a1)
 ; RV32-NEXT:    lui a1, 15
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vs1r.v v0, (a1) # Unknown-size Folded Spill
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 49
+; RV32-NEXT:    li a3, 29
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vslideup.vi v20, v16, 6
+; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vslideup.vi v12, v24, 6
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a3, a1, 5
-; RV32-NEXT:    add a1, a3, a1
+; RV32-NEXT:    li a3, 37
+; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v20, v24, v8, v0.t
+; RV32-NEXT:    vrgatherei16.vv v12, v24, v20, v0.t
 ; RV32-NEXT:    lui a1, 3073
 ; RV32-NEXT:    lui a3, %hi(.LCPI8_8)
 ; RV32-NEXT:    addi a3, a3, %lo(.LCPI8_8)
@@ -500,26 +516,26 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    addi a4, a4, %lo(.LCPI8_9)
 ; RV32-NEXT:    addi a1, a1, -1024
 ; RV32-NEXT:    vsetvli zero, a2, e16, m4, ta, ma
-; RV32-NEXT:    vle16.v v16, (a3)
+; RV32-NEXT:    vle16.v v4, (a3)
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
 ; RV32-NEXT:    vle16.v v2, (a4)
 ; RV32-NEXT:    vmv.s.x v0, a1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a3, 41
+; RV32-NEXT:    li a3, 45
 ; RV32-NEXT:    mul a1, a1, a3
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
-; RV32-NEXT:    vcompress.vm v8, v24, v0
+; RV32-NEXT:    vcompress.vm v16, v24, v0
 ; RV32-NEXT:    vmv1r.v v0, v1
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 57
+; RV32-NEXT:    li a2, 53
 ; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vrgatherei16.vv v8, v24, v16, v0.t
+; RV32-NEXT:    vrgatherei16.vv v16, v24, v4, v0.t
 ; RV32-NEXT:    addi a1, sp, 16
 ; RV32-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
@@ -527,36 +543,36 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    add a1, a2, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 5
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    li a2, 37
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
 ; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, mu
-; RV32-NEXT:    vrgatherei16.vv v12, v24, v2, v0.t
+; RV32-NEXT:    vrgatherei16.vv v20, v24, v2, v0.t
 ; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a2, 25
+; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
-; RV32-NEXT:    vmv.v.v v4, v24
+; RV32-NEXT:    vl4r.v v24, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    li a2, 25
-; RV32-NEXT:    mul a1, a1, a2
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 16
-; RV32-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vmv.v.v v20, v24
+; RV32-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
+; RV32-NEXT:    vsetivli zero, 10, e32, m4, tu, ma
+; RV32-NEXT:    vmv.v.v v24, v0
 ; RV32-NEXT:    vmv.v.v v12, v8
+; RV32-NEXT:    vmv.v.v v20, v16
 ; RV32-NEXT:    addi a1, a0, 320
 ; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; RV32-NEXT:    vse32.v v12, (a1)
-; RV32-NEXT:    addi a1, a0, 256
 ; RV32-NEXT:    vse32.v v20, (a1)
+; RV32-NEXT:    addi a1, a0, 256
+; RV32-NEXT:    vse32.v v12, (a1)
 ; RV32-NEXT:    addi a1, a0, 192
-; RV32-NEXT:    vse32.v v4, (a1)
+; RV32-NEXT:    vse32.v v24, (a1)
 ; RV32-NEXT:    addi a1, a0, 128
 ; RV32-NEXT:    csrr a2, vlenb
 ; RV32-NEXT:    slli a3, a2, 3
@@ -581,8 +597,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV32-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV32-NEXT:    vse32.v v8, (a0)
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a1, a0, 6
-; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    li a1, 61
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    add sp, sp, a0
 ; RV32-NEXT:    .cfi_def_cfa sp, 16
 ; RV32-NEXT:    addi sp, sp, 16
@@ -594,331 +610,237 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 67
+; RV64-NEXT:    li a3, 53
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc3, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 67 * vlenb
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x35, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 53 * vlenb
 ; RV64-NEXT:    addi a2, a1, 128
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; RV64-NEXT:    vle64.v v8, (a1)
 ; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    li a4, 59
+; RV64-NEXT:    li a4, 37
 ; RV64-NEXT:    mul a3, a3, a4
 ; RV64-NEXT:    add a3, sp, a3
 ; RV64-NEXT:    addi a3, a3, 16
 ; RV64-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; RV64-NEXT:    addi a1, a1, 256
 ; RV64-NEXT:    li a3, 128
-; RV64-NEXT:    vle64.v v24, (a1)
+; RV64-NEXT:    vle64.v v8, (a1)
 ; RV64-NEXT:    lui a1, 1
-; RV64-NEXT:    vid.v v8
-; RV64-NEXT:    vmv.s.x v0, a3
-; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    li a4, 30
-; RV64-NEXT:    mul a3, a3, a4
-; RV64-NEXT:    add a3, sp, a3
-; RV64-NEXT:    addi a3, a3, 16
-; RV64-NEXT:    vs1r.v v0, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vid.v v16
+; RV64-NEXT:    vmv.s.x v1, a3
 ; RV64-NEXT:    li a3, 6
-; RV64-NEXT:    vmul.vx v6, v8, a3
+; RV64-NEXT:    vmul.vx v2, v16, a3
 ; RV64-NEXT:    li a3, 56
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vrgather.vi v8, v24, 4
-; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    li a5, 22
-; RV64-NEXT:    mul a4, a4, a5
-; RV64-NEXT:    add a4, sp, a4
-; RV64-NEXT:    addi a4, a4, 16
-; RV64-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vrgather.vi v16, v8, 4
 ; RV64-NEXT:    vsetivli zero, 8, e64, m8, ta, ma
-; RV64-NEXT:    vslidedown.vi v16, v24, 8
+; RV64-NEXT:    vslidedown.vi v24, v8, 8
 ; RV64-NEXT:    csrr a4, vlenb
-; RV64-NEXT:    li a5, 39
+; RV64-NEXT:    li a5, 29
 ; RV64-NEXT:    mul a4, a4, a5
 ; RV64-NEXT:    add a4, sp, a4
 ; RV64-NEXT:    addi a4, a4, 16
-; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    addi a4, sp, 16
+; RV64-NEXT:    vs1r.v v1, (a4) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v8, v16, 2, v0.t
-; RV64-NEXT:    vmv.v.v v20, v8
-; RV64-NEXT:    vmv.s.x v8, a3
-; RV64-NEXT:    csrr a3, vlenb
-; RV64-NEXT:    li a4, 55
-; RV64-NEXT:    mul a3, a3, a4
-; RV64-NEXT:    add a3, sp, a3
-; RV64-NEXT:    addi a3, a3, 16
-; RV64-NEXT:    vs1r.v v8, (a3) # Unknown-size Folded Spill
+; RV64-NEXT:    vrgather.vi v16, v24, 2, v0.t
+; RV64-NEXT:    vmv.v.v v4, v16
+; RV64-NEXT:    vmv.s.x v12, a3
 ; RV64-NEXT:    addi a3, a1, 65
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vle64.v v8, (a2)
+; RV64-NEXT:    vle64.v v16, (a2)
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a4, 47
+; RV64-NEXT:    li a4, 45
 ; RV64-NEXT:    mul a2, a2, a4
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv.s.x v16, a3
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 35
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs1r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv.s.x v13, a3
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v16, v6, -16
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 5
-; RV64-NEXT:    sub a2, a3, a2
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv2r.v v18, v6
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 12
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vadd.vi v14, v2, -16
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 59
+; RV64-NEXT:    li a3, 37
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 35
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl1r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vcompress.vm v24, v0, v16
+; RV64-NEXT:    vcompress.vm v16, v24, v13
+; RV64-NEXT:    vmv1r.v v0, v12
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 55
+; RV64-NEXT:    li a3, 45
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 5
-; RV64-NEXT:    sub a2, a3, a2
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl2r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v24, v8, v16, v0.t
+; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v16, v24, v14, v0.t
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v20, v24
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 18
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs4r.v v20, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv.v.v v4, v16
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 22
+; RV64-NEXT:    li a3, 13
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vs4r.v v4, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v8, v24, 5
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 30
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgather.vi v16, v8, 5
+; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 39
+; RV64-NEXT:    li a3, 29
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgather.vi v8, v24, 3, v0.t
-; RV64-NEXT:    vmv.v.v v20, v8
+; RV64-NEXT:    vrgather.vi v16, v24, 3, v0.t
+; RV64-NEXT:    vmv.v.v v4, v16
 ; RV64-NEXT:    lui a2, 2
 ; RV64-NEXT:    addi a2, a2, 130
-; RV64-NEXT:    vmv.s.x v8, a2
+; RV64-NEXT:    vmv.s.x v13, a2
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v16, v18, -15
+; RV64-NEXT:    vadd.vi v14, v2, -15
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 59
+; RV64-NEXT:    li a3, 37
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vcompress.vm v24, v0, v8
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 55
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vcompress.vm v16, v24, v13
+; RV64-NEXT:    vmv1r.v v0, v12
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 47
+; RV64-NEXT:    li a3, 45
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v24, v8, v16, v0.t
+; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v16, v24, v14, v0.t
 ; RV64-NEXT:    vsetivli zero, 6, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v20, v24
+; RV64-NEXT:    vmv.v.v v4, v16
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 14
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a3, a2, 3
+; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs4r.v v20, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v4, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    lui a2, 16
 ; RV64-NEXT:    addi a2, a2, 7
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; RV64-NEXT:    vmv.v.i v8, 6
-; RV64-NEXT:    vmv.v.x v9, a2
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 22
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.v.i v16, 6
+; RV64-NEXT:    vmv.v.x v17, a2
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vrgatherei16.vv v12, v16, v8
+; RV64-NEXT:    vrgatherei16.vv v12, v8, v16
+; RV64-NEXT:    vrgatherei16.vv v20, v8, v17
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 55
+; RV64-NEXT:    li a3, 25
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs4r.v v12, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vrgatherei16.vv v12, v16, v9
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 5
-; RV64-NEXT:    sub a2, a3, a2
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs4r.v v12, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv4r.v v8, v16
-; RV64-NEXT:    vrgather.vi v12, v16, 2
+; RV64-NEXT:    vs4r.v v20, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vrgather.vi v16, v8, 2
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 35
+; RV64-NEXT:    li a3, 21
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs4r.v v12, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vrgather.vi v12, v16, 3
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs4r.v v12, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vs4r.v v16, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vrgather.vi v4, v8, 3
 ; RV64-NEXT:    lui a2, 4
 ; RV64-NEXT:    li a3, 24
 ; RV64-NEXT:    addi a2, a2, 260
-; RV64-NEXT:    vmv.s.x v0, a3
-; RV64-NEXT:    addi a3, sp, 16
-; RV64-NEXT:    vs1r.v v0, (a3) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv.s.x v24, a2
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 12
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl2r.v v2, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.s.x v1, a3
+; RV64-NEXT:    vmv.s.x v8, a2
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v6, v2, -14
+; RV64-NEXT:    vadd.vi v10, v2, -14
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 59
+; RV64-NEXT:    li a3, 37
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vcompress.vm v8, v16, v24
+; RV64-NEXT:    vcompress.vm v24, v16, v8
+; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 47
+; RV64-NEXT:    li a3, 45
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v6, v0.t
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 22
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vrgatherei16.vv v24, v16, v10, v0.t
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vl1r.v v29, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv1r.v v0, v29
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 30
+; RV64-NEXT:    li a3, 29
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl1r.v v1, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
+; RV64-NEXT:    vrgather.vi v12, v16, 4, v0.t
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 39
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a3, a2, 4
+; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vs4r.v v12, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    lui a2, 8
+; RV64-NEXT:    addi a2, a2, 520
+; RV64-NEXT:    vmv.s.x v28, a2
+; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; RV64-NEXT:    vadd.vi v30, v2, -13
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 55
+; RV64-NEXT:    li a3, 37
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl4r.v v28, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
-; RV64-NEXT:    vrgather.vi v28, v24, 4, v0.t
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT:    vcompress.vm v8, v16, v28
+; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 55
+; RV64-NEXT:    li a3, 45
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs4r.v v28, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    lui a2, 8
-; RV64-NEXT:    addi a2, a2, 520
-; RV64-NEXT:    vmv.s.x v7, a2
-; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v4, v2, -13
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v8, v16, v30, v0.t
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 59
-; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vcompress.vm v8, v24, v7
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v16, v4, v0.t
 ; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    vmv1r.v v0, v29
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 39
+; RV64-NEXT:    li a3, 29
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 5
-; RV64-NEXT:    sub a2, a3, a2
+; RV64-NEXT:    li a3, 25
+; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vrgather.vi v8, v16, 5, v0.t
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a3, a2, 5
-; RV64-NEXT:    sub a2, a3, a2
+; RV64-NEXT:    li a3, 25
+; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs4r.v v8, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    lui a2, 96
 ; RV64-NEXT:    li a3, 192
-; RV64-NEXT:    vmv.s.x v1, a3
+; RV64-NEXT:    vmv.s.x v30, a3
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a2
-; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    vmv1r.v v0, v30
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 35
+; RV64-NEXT:    li a3, 21
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
@@ -926,147 +848,133 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vrgatherei16.vv v12, v16, v8, v0.t
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 35
+; RV64-NEXT:    li a3, 21
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs4r.v v12, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    li a2, 1040
 ; RV64-NEXT:    li a3, 28
-; RV64-NEXT:    vmv.s.x v20, a2
-; RV64-NEXT:    vmv.s.x v0, a3
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 30
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs1r.v v0, (a2) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv.s.x v31, a2
+; RV64-NEXT:    vmv.s.x v1, a3
 ; RV64-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
-; RV64-NEXT:    vadd.vi v22, v2, -12
+; RV64-NEXT:    vadd.vi v28, v2, -12
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 59
+; RV64-NEXT:    li a3, 37
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vcompress.vm v8, v24, v20
+; RV64-NEXT:    vcompress.vm v8, v16, v31
+; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 47
+; RV64-NEXT:    li a3, 45
 ; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    vrgatherei16.vv v8, v24, v22, v0.t
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vrgatherei16.vv v8, v16, v28, v0.t
 ; RV64-NEXT:    lui a2, 112
 ; RV64-NEXT:    addi a2, a2, 1
 ; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v12, a2
-; RV64-NEXT:    vmv1r.v v0, v1
+; RV64-NEXT:    vmv1r.v v0, v30
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    li a3, 29
+; RV64-NEXT:    mul a2, a2, a3
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl4r.v v4, (a2) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, mu
 ; RV64-NEXT:    vrgatherei16.vv v4, v16, v12, v0.t
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 55
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a3, a2, 4
+; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl4r.v v12, (a2) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 22
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v12, v16
+; RV64-NEXT:    vmv.v.v v12, v24
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 55
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a3, a2, 4
+; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vs4r.v v12, (a2) # Unknown-size Folded Spill
 ; RV64-NEXT:    addi a1, a1, -2016
 ; RV64-NEXT:    vmv.s.x v12, a1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 59
+; RV64-NEXT:    li a2, 37
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT:    vcompress.vm v16, v24, v12
+; RV64-NEXT:    vcompress.vm v24, v16, v12
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64-NEXT:    vadd.vi v12, v2, -11
+; RV64-NEXT:    vmv1r.v v0, v1
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 30
+; RV64-NEXT:    li a2, 45
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
+; RV64-NEXT:    vrgatherei16.vv v24, v16, v12, v0.t
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 47
+; RV64-NEXT:    li a2, 25
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; RV64-NEXT:    vrgatherei16.vv v16, v24, v12, v0.t
+; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 5
-; RV64-NEXT:    sub a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v12, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsetivli zero, 5, e64, m4, tu, ma
-; RV64-NEXT:    vmv.v.v v12, v24
+; RV64-NEXT:    vmv.v.v v12, v16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 35
+; RV64-NEXT:    li a2, 21
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
-; RV64-NEXT:    vl4r.v v20, (a1) # Unknown-size Folded Reload
-; RV64-NEXT:    vmv.v.v v20, v8
-; RV64-NEXT:    vmv4r.v v8, v4
-; RV64-NEXT:    vmv.v.v v8, v16
+; RV64-NEXT:    vl4r.v v16, (a1) # Unknown-size Folded Reload
+; RV64-NEXT:    vmv.v.v v16, v8
+; RV64-NEXT:    vmv.v.v v4, v24
 ; RV64-NEXT:    addi a1, a0, 256
 ; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
-; RV64-NEXT:    vse64.v v20, (a1)
+; RV64-NEXT:    vse64.v v16, (a1)
 ; RV64-NEXT:    addi a1, a0, 320
-; RV64-NEXT:    vse64.v v8, (a1)
+; RV64-NEXT:    vse64.v v4, (a1)
 ; RV64-NEXT:    addi a1, a0, 192
 ; RV64-NEXT:    vse64.v v12, (a1)
 ; RV64-NEXT:    addi a1, a0, 128
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 55
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a3, a2, 4
+; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    addi a1, a0, 64
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 14
-; RV64-NEXT:    mul a2, a2, a3
+; RV64-NEXT:    slli a3, a2, 3
+; RV64-NEXT:    add a2, a3, a2
 ; RV64-NEXT:    add a2, sp, a2
 ; RV64-NEXT:    addi a2, a2, 16
 ; RV64-NEXT:    vl4r.v v8, (a2) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a1)
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    li a2, 18
+; RV64-NEXT:    li a2, 13
 ; RV64-NEXT:    mul a1, a1, a2
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 16
 ; RV64-NEXT:    vl4r.v v8, (a1) # Unknown-size Folded Reload
 ; RV64-NEXT:    vse64.v v8, (a0)
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a1, 67
+; RV64-NEXT:    li a1, 53
 ; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
index 46c2033d28b387..de90d250bfb299 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll
@@ -578,12 +578,6 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    lui a2, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a2)
@@ -595,14 +589,10 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a2, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a2
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    fsflags a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
@@ -620,12 +610,6 @@ define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    fsflags a0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
index 8bf30f8f0d072b..b6f911b8e477a2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll
@@ -1945,12 +1945,6 @@ declare float @llvm.vector.reduce.fminimum.v64f32(<64 x float>)
 define float @vreduce_fminimum_v64f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v64f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -1959,29 +1953,19 @@ define float @vreduce_fminimum_v64f32(ptr %x) {
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB119_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a0, 523264
 ; CHECK-NEXT:    fmv.w.x fa0, a0
-; CHECK-NEXT:    j .LBB119_3
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB119_2:
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
-; CHECK-NEXT:  .LBB119_3:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = load <64 x float>, ptr %x
   %red = call float @llvm.vector.reduce.fminimum.v64f32(<64 x float> %v)
@@ -2019,57 +2003,57 @@ define float @vreduce_fminimum_v128f32(ptr %x) {
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    addi a2, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a2)
+; CHECK-NEXT:    vle32.v v24, (a2)
 ; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle32.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle32.v v8, (a1)
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v24, v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v16, v8
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB121_2
@@ -2265,12 +2249,6 @@ declare double @llvm.vector.reduce.fminimum.v32f64(<32 x double>)
 define double @vreduce_fminimum_v32f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fminimum_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a0)
@@ -2278,29 +2256,19 @@ define double @vreduce_fminimum_v32f64(ptr %x) {
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB131_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a0, %hi(.LCPI131_0)
 ; CHECK-NEXT:    fld fa0, %lo(.LCPI131_0)(a0)
-; CHECK-NEXT:    j .LBB131_3
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB131_2:
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
-; CHECK-NEXT:  .LBB131_3:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = load <32 x double>, ptr %x
   %red = call double @llvm.vector.reduce.fminimum.v32f64(<32 x double> %v)
@@ -2336,57 +2304,57 @@ define double @vreduce_fminimum_v64f64(ptr %x) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v24, v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v16, v8
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vfmin.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB133_2
@@ -2659,12 +2627,6 @@ declare float @llvm.vector.reduce.fmaximum.v64f32(<64 x float>)
 define float @vreduce_fmaximum_v64f32(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v64f32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    li a2, 32
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
@@ -2673,29 +2635,19 @@ define float @vreduce_fmaximum_v64f32(ptr %x) {
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB147_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a0, 523264
 ; CHECK-NEXT:    fmv.w.x fa0, a0
-; CHECK-NEXT:    j .LBB147_3
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB147_2:
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
-; CHECK-NEXT:  .LBB147_3:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = load <64 x float>, ptr %x
   %red = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> %v)
@@ -2733,57 +2685,57 @@ define float @vreduce_fmaximum_v128f32(ptr %x) {
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:    addi a2, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a2)
+; CHECK-NEXT:    vle32.v v24, (a2)
 ; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle32.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vle32.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle32.v v24, (a1)
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle32.v v8, (a1)
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v24, v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v16, v8
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB149_2
@@ -2979,12 +2931,6 @@ declare double @llvm.vector.reduce.fmaximum.v32f64(<32 x double>)
 define double @vreduce_fmaximum_v32f64(ptr %x) {
 ; CHECK-LABEL: vreduce_fmaximum_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a0)
@@ -2992,29 +2938,19 @@ define double @vreduce_fmaximum_v32f64(ptr %x) {
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB159_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    lui a0, %hi(.LCPI159_0)
 ; CHECK-NEXT:    fld fa0, %lo(.LCPI159_0)(a0)
-; CHECK-NEXT:    j .LBB159_3
+; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB159_2:
 ; CHECK-NEXT:    vfredmax.vs v8, v8, v8
 ; CHECK-NEXT:    vfmv.f.s fa0, v8
-; CHECK-NEXT:  .LBB159_3:
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = load <32 x double>, ptr %x
   %red = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> %v)
@@ -3050,57 +2986,57 @@ define double @vreduce_fmaximum_v64f64(ptr %x) {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a1)
+; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    addi a1, a0, 384
 ; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi a1, a0, 256
-; CHECK-NEXT:    vle64.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v16, v16
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vle64.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmfeq.vv v0, v24, v24
+; CHECK-NEXT:    vmfeq.vv v7, v16, v16
+; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v8, (a1)
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v24, v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v16, v8
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
+; CHECK-NEXT:    vmfeq.vv v7, v8, v8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v16, v16, v8, v0
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmfeq.vv v0, v8, v8
-; CHECK-NEXT:    vmfeq.vv v7, v24, v24
-; CHECK-NEXT:    vmerge.vvm v16, v8, v24, v0
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
-; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v16, v8, v16
+; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vfmax.vv v8, v16, v8
 ; CHECK-NEXT:    vmfne.vv v16, v8, v8
 ; CHECK-NEXT:    vcpop.m a0, v16
 ; CHECK-NEXT:    beqz a0, .LBB161_2
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
index b8617fda3aa7ec..4c4497482e75cd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll
@@ -526,24 +526,14 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    addi a1, a0, -16
 ; CHECK-NEXT:    sltu a0, a0, a1
 ; CHECK-NEXT:    addi a0, a0, -1
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    and a0, a0, a1
@@ -564,12 +554,6 @@ define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroex
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.rint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
index 820a05e3d6042b..5136505cdea9f9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll
@@ -737,29 +737,18 @@ declare <32 x double> @llvm.vp.round.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v6, v0
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    addi a1, a0, -16
@@ -767,43 +756,28 @@ define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroe
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 4
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.round.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
index 8391c7939180a0..726541dc6d9d44 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll
@@ -737,29 +737,18 @@ declare <32 x double> @llvm.vp.roundeven.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v6, v0
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    addi a1, a0, -16
@@ -767,43 +756,28 @@ define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 z
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 0
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.roundeven.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
index 8c38d244602655..3ac21ce8793078 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll
@@ -737,29 +737,18 @@ declare <32 x double> @llvm.vp.roundtozero.v32f64(<32 x double>, <32 x i1>, i32)
 define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_v32f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v25, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v6, v0
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:    bltu a0, a2, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:  .LBB26_2:
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    lui a1, %hi(.LCPI26_0)
 ; CHECK-NEXT:    fld fa5, %lo(.LCPI26_0)(a1)
 ; CHECK-NEXT:    addi a1, a0, -16
@@ -767,43 +756,28 @@ define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32
 ; CHECK-NEXT:    addi a0, a0, -1
 ; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v25, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a1, 1
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a1
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v8, v16, v0.t
+; CHECK-NEXT:    vfabs.v v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v24, v8, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v8, v16, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v8, v8, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v16, v8, v16, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    ret
   %v = call <32 x double> @llvm.vp.roundtozero.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl)
   ret <32 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 03d5762b4903ef..90d4a5e511c870 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -1063,49 +1063,39 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFH-NEXT:    addi sp, sp, -16
 ; ZVFH-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 4
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    slli a1, a1, 3
-; ZVFH-NEXT:    add a1, sp, a1
-; ZVFH-NEXT:    addi a1, a1, 16
+; ZVFH-NEXT:    sub sp, sp, a1
+; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT:    addi a1, sp, 16
 ; ZVFH-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFH-NEXT:    addi a1, a0, 128
 ; ZVFH-NEXT:    li a3, 64
 ; ZVFH-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
 ; ZVFH-NEXT:    vle16.v v16, (a1)
-; ZVFH-NEXT:    addi a1, sp, 16
-; ZVFH-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vle16.v v16, (a0)
+; ZVFH-NEXT:    vle16.v v24, (a0)
 ; ZVFH-NEXT:    mv a0, a2
 ; ZVFH-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; ZVFH-NEXT:    vslidedown.vi v24, v0, 8
+; ZVFH-NEXT:    vslidedown.vi v7, v0, 8
 ; ZVFH-NEXT:    bltu a2, a3, .LBB43_2
 ; ZVFH-NEXT:  # %bb.1:
 ; ZVFH-NEXT:    li a0, 64
 ; ZVFH-NEXT:  .LBB43_2:
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v7, v8, v16, v0.t
+; ZVFH-NEXT:    vmfeq.vv v6, v8, v24, v0.t
 ; ZVFH-NEXT:    addi a0, a2, -64
 ; ZVFH-NEXT:    sltu a1, a2, a0
 ; ZVFH-NEXT:    addi a1, a1, -1
 ; ZVFH-NEXT:    and a0, a1, a0
-; ZVFH-NEXT:    vmv1r.v v0, v24
-; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 3
-; ZVFH-NEXT:    add a1, sp, a1
-; ZVFH-NEXT:    addi a1, a1, 16
-; ZVFH-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vmv1r.v v0, v7
 ; ZVFH-NEXT:    addi a1, sp, 16
 ; ZVFH-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v8, v16, v24, v0.t
+; ZVFH-NEXT:    vmfeq.vv v8, v24, v16, v0.t
 ; ZVFH-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; ZVFH-NEXT:    vslideup.vi v7, v8, 8
-; ZVFH-NEXT:    vmv.v.v v0, v7
+; ZVFH-NEXT:    vslideup.vi v6, v8, 8
+; ZVFH-NEXT:    vmv.v.v v0, v6
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    add sp, sp, a0
 ; ZVFH-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFH-NEXT:    addi sp, sp, 16
@@ -1143,7 +1133,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    addi s0, sp, 896
 ; ZVFHMIN32-NEXT:    .cfi_def_cfa s0, 0
 ; ZVFHMIN32-NEXT:    csrr a1, vlenb
-; ZVFHMIN32-NEXT:    li a2, 30
+; ZVFHMIN32-NEXT:    li a2, 28
 ; ZVFHMIN32-NEXT:    mul a1, a1, a2
 ; ZVFHMIN32-NEXT:    sub sp, sp, a1
 ; ZVFHMIN32-NEXT:    andi sp, sp, -128
@@ -1315,67 +1305,66 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 7
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 29
+; ZVFHMIN32-NEXT:    li a3, 27
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 6
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 28
+; ZVFHMIN32-NEXT:    li a3, 26
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 5
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 27
+; ZVFHMIN32-NEXT:    li a3, 25
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 4
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 26
+; ZVFHMIN32-NEXT:    li a3, 24
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 3
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 25
+; ZVFHMIN32-NEXT:    li a3, 23
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 2
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 24
+; ZVFHMIN32-NEXT:    li a3, 22
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 1
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 23
+; ZVFHMIN32-NEXT:    li a3, 21
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v26, v8, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v4, v8, 15
+; ZVFHMIN32-NEXT:    vslidedown.vi v10, v8, 14
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 1
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v4, v8, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v2, v8, 10
-; ZVFHMIN32-NEXT:    vslidedown.vi v30, v8, 9
-; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 8
+; ZVFHMIN32-NEXT:    vslidedown.vi v30, v8, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v6, v8, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v28, v8, 11
+; ZVFHMIN32-NEXT:    vslidedown.vi v26, v8, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v22, v8, 9
+; ZVFHMIN32-NEXT:    vslidedown.vi v20, v8, 8
 ; ZVFHMIN32-NEXT:    vmv.x.s a4, v16
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
@@ -1384,52 +1373,54 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 560(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 304(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v3, v16, 7
-; ZVFHMIN32-NEXT:    vslidedown.vi v31, v16, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v5, v16, 5
-; ZVFHMIN32-NEXT:    vslidedown.vi v23, v16, 4
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 3
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 7
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 21
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a3, a2, 1
+; ZVFHMIN32-NEXT:    add a2, a3, a2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 2
+; ZVFHMIN32-NEXT:    vslidedown.vi v21, v16, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 5
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 20
+; ZVFHMIN32-NEXT:    li a3, 18
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 1
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 22
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vslidedown.vi v23, v16, 4
+; ZVFHMIN32-NEXT:    vslidedown.vi v31, v16, 3
+; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 2
+; ZVFHMIN32-NEXT:    addi a2, sp, 848
 ; ZVFHMIN32-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v7, v16, 1
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v18, v16, 15
-; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 14
+; ZVFHMIN32-NEXT:    vslidedown.vi v14, v16, 15
+; ZVFHMIN32-NEXT:    vslidedown.vi v12, v16, 14
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v16, 13
-; ZVFHMIN32-NEXT:    vslidedown.vi v12, v16, 12
+; ZVFHMIN32-NEXT:    vslidedown.vi v18, v16, 12
 ; ZVFHMIN32-NEXT:    vslidedown.vi v10, v16, 11
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 10
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v16, 10
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 18
+; ZVFHMIN32-NEXT:    li a3, 19
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 9
+; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v16, 9
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    li a3, 14
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v6, v16, 8
+; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v16, v16, 8
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 12
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1437,95 +1428,86 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 558(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 302(sp)
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN32-NEXT:    vslidedown.vi v13, v0, 7
-; ZVFHMIN32-NEXT:    vslidedown.vi v29, v0, 6
-; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 5
-; ZVFHMIN32-NEXT:    vslidedown.vi v7, v0, 4
-; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 3
-; ZVFHMIN32-NEXT:    vslidedown.vi v21, v0, 2
-; ZVFHMIN32-NEXT:    vslidedown.vi v27, v0, 1
+; ZVFHMIN32-NEXT:    vslidedown.vi v29, v0, 7
+; ZVFHMIN32-NEXT:    vslidedown.vi v19, v0, 6
+; ZVFHMIN32-NEXT:    vslidedown.vi v27, v0, 5
+; ZVFHMIN32-NEXT:    vslidedown.vi v13, v0, 4
+; ZVFHMIN32-NEXT:    vslidedown.vi v15, v0, 3
+; ZVFHMIN32-NEXT:    vslidedown.vi v9, v0, 2
+; ZVFHMIN32-NEXT:    vslidedown.vi v11, v0, 1
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 15
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 2
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 14
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 13
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 14
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    li a3, 6
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 12
+; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 13
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 12
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    slli a2, a2, 2
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 11
+; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 12
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    li a3, 10
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 10
+; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 11
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    slli a2, a2, 3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 10
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
 ; ZVFHMIN32-NEXT:    slli a2, a2, 4
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vslidedown.vi v16, v0, 9
+; ZVFHMIN32-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN32-NEXT:    vslidedown.vi v2, v0, 9
 ; ZVFHMIN32-NEXT:    vslidedown.vi v0, v0, 8
-; ZVFHMIN32-NEXT:    addi a2, sp, 848
-; ZVFHMIN32-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN32-NEXT:    vmv.x.s t4, v26
+; ZVFHMIN32-NEXT:    vmv.x.s t3, v4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 300(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t3, v20
-; ZVFHMIN32-NEXT:    vmv.x.s t1, v28
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    addi a2, a2, 848
+; ZVFHMIN32-NEXT:    vl2r.v v4, (a2) # Unknown-size Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s t4, v4
+; ZVFHMIN32-NEXT:    vmv.x.s t2, v30
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 298(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 1
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    addi a2, a2, 848
-; ZVFHMIN32-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t2, v0
-; ZVFHMIN32-NEXT:    vmv.x.s t0, v4
+; ZVFHMIN32-NEXT:    vmv.x.s t1, v6
+; ZVFHMIN32-NEXT:    vmv.x.s t0, v28
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 213(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 296(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a7, v2
-; ZVFHMIN32-NEXT:    vmv.x.s a6, v30
+; ZVFHMIN32-NEXT:    vmv.x.s a7, v26
+; ZVFHMIN32-NEXT:    vmv.x.s a6, v22
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 294(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a5, v22
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v18
+; ZVFHMIN32-NEXT:    vmv.x.s a5, v20
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v14
 ; ZVFHMIN32-NEXT:    sw a2, 112(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
@@ -1533,7 +1515,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 211(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 548(sp)
 ; ZVFHMIN32-NEXT:    lh t5, 292(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v14
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v12
 ; ZVFHMIN32-NEXT:    sw a0, 116(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    vmv.x.s a0, v8
 ; ZVFHMIN32-NEXT:    sw a0, 124(sp) # 4-byte Folded Spill
@@ -1560,7 +1542,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a1, 208(sp)
 ; ZVFHMIN32-NEXT:    lh t5, 738(sp)
 ; ZVFHMIN32-NEXT:    lh t6, 482(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a0, v12
+; ZVFHMIN32-NEXT:    vmv.x.s a0, v18
 ; ZVFHMIN32-NEXT:    sw a0, 108(sp) # 4-byte Folded Spill
 ; ZVFHMIN32-NEXT:    vmv.x.s a0, v10
 ; ZVFHMIN32-NEXT:    sw a0, 120(sp) # 4-byte Folded Spill
@@ -1571,12 +1553,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh t5, 736(sp)
 ; ZVFHMIN32-NEXT:    lh t6, 480(sp)
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 29
+; ZVFHMIN32-NEXT:    li a1, 27
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s5, 848(a0) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 28
+; ZVFHMIN32-NEXT:    li a1, 26
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s6, 848(a0) # 8-byte Folded Reload
@@ -1587,12 +1569,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh t5, 734(sp)
 ; ZVFHMIN32-NEXT:    lh t6, 478(sp)
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 27
+; ZVFHMIN32-NEXT:    li a1, 25
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s7, 848(a0) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 26
+; ZVFHMIN32-NEXT:    li a1, 24
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s8, 848(a0) # 8-byte Folded Reload
@@ -1603,12 +1585,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh t5, 732(sp)
 ; ZVFHMIN32-NEXT:    lh t6, 476(sp)
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 25
+; ZVFHMIN32-NEXT:    li a1, 23
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s4, 848(a0) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 24
+; ZVFHMIN32-NEXT:    li a1, 22
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s3, 848(a0) # 8-byte Folded Reload
@@ -1619,43 +1601,47 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh t6, 730(sp)
 ; ZVFHMIN32-NEXT:    lh s9, 474(sp)
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
-; ZVFHMIN32-NEXT:    li a1, 23
+; ZVFHMIN32-NEXT:    li a1, 21
 ; ZVFHMIN32-NEXT:    mul a0, a0, a1
 ; ZVFHMIN32-NEXT:    add a0, sp, a0
 ; ZVFHMIN32-NEXT:    lh s2, 848(a0) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v3
+; ZVFHMIN32-NEXT:    csrr a0, vlenb
+; ZVFHMIN32-NEXT:    slli a1, a0, 1
+; ZVFHMIN32-NEXT:    add a0, a1, a0
+; ZVFHMIN32-NEXT:    add a0, sp, a0
+; ZVFHMIN32-NEXT:    lh t5, 848(a0) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, s9
 ; ZVFHMIN32-NEXT:    feq.h t6, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb t6, 173(sp)
 ; ZVFHMIN32-NEXT:    lh s9, 728(sp)
 ; ZVFHMIN32-NEXT:    lh s10, 472(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v31
-; ZVFHMIN32-NEXT:    vmv.x.s ra, v13
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v21
+; ZVFHMIN32-NEXT:    vmv.x.s ra, v29
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
 ; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb s9, 172(sp)
 ; ZVFHMIN32-NEXT:    lh s9, 726(sp)
 ; ZVFHMIN32-NEXT:    lh s10, 470(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v29
-; ZVFHMIN32-NEXT:    vmv.x.s a3, v11
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v19
+; ZVFHMIN32-NEXT:    vmv.x.s a3, v27
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, s9
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, s10
 ; ZVFHMIN32-NEXT:    feq.h s9, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb s9, 171(sp)
 ; ZVFHMIN32-NEXT:    lh s10, 724(sp)
 ; ZVFHMIN32-NEXT:    lh s11, 468(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s a4, v7
-; ZVFHMIN32-NEXT:    vmv.x.s s9, v9
+; ZVFHMIN32-NEXT:    vmv.x.s a4, v13
+; ZVFHMIN32-NEXT:    vmv.x.s s9, v15
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, s10
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, s11
 ; ZVFHMIN32-NEXT:    feq.h s10, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb s10, 170(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 722(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 466(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s10, v21
-; ZVFHMIN32-NEXT:    vmv.x.s s11, v27
+; ZVFHMIN32-NEXT:    vmv.x.s s10, v9
+; ZVFHMIN32-NEXT:    vmv.x.s s11, v11
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1738,7 +1724,11 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a1, 161(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 610(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 354(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s s6, v5
+; ZVFHMIN32-NEXT:    csrr a2, vlenb
+; ZVFHMIN32-NEXT:    li a3, 18
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
+; ZVFHMIN32-NEXT:    add a2, sp, a2
+; ZVFHMIN32-NEXT:    lh s6, 848(a2) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s s5, v23
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
@@ -1746,27 +1736,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    sb a0, 241(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 608(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 352(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 21
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s4, 848(a2) # 8-byte Folded Reload
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 20
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s3, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s4, v31
+; ZVFHMIN32-NEXT:    lh s3, 848(sp) # 8-byte Folded Reload
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 240(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 606(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 350(sp)
-; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 22
-; ZVFHMIN32-NEXT:    mul a2, a2, a3
-; ZVFHMIN32-NEXT:    add a2, sp, a2
-; ZVFHMIN32-NEXT:    lh s2, 848(a2) # 8-byte Folded Reload
+; ZVFHMIN32-NEXT:    vmv.x.s s2, v7
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa3, a1
@@ -1901,7 +1879,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 766(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 510(sp)
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    li a3, 18
+; ZVFHMIN32-NEXT:    li a3, 19
 ; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
@@ -1913,20 +1891,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s t6, v8
+; ZVFHMIN32-NEXT:    vmv.x.s t5, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    sb a0, 191(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 764(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 508(sp)
-; ZVFHMIN32-NEXT:    vmv.x.s t5, v6
 ; ZVFHMIN32-NEXT:    csrr a2, vlenb
-; ZVFHMIN32-NEXT:    slli a2, a2, 2
+; ZVFHMIN32-NEXT:    li a3, 12
+; ZVFHMIN32-NEXT:    mul a2, a2, a3
 ; ZVFHMIN32-NEXT:    add a2, sp, a2
 ; ZVFHMIN32-NEXT:    addi a2, a2, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN32-NEXT:    vmv.x.s t6, v8
+; ZVFHMIN32-NEXT:    vmv.x.s a2, v16
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
@@ -1934,14 +1913,14 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 762(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 506(sp)
 ; ZVFHMIN32-NEXT:    csrr a3, vlenb
-; ZVFHMIN32-NEXT:    slli a3, a3, 3
+; ZVFHMIN32-NEXT:    li a4, 6
+; ZVFHMIN32-NEXT:    mul a3, a3, a4
 ; ZVFHMIN32-NEXT:    add a3, sp, a3
 ; ZVFHMIN32-NEXT:    addi a3, a3, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s a3, v8
 ; ZVFHMIN32-NEXT:    csrr a4, vlenb
-; ZVFHMIN32-NEXT:    li s3, 6
-; ZVFHMIN32-NEXT:    mul a4, a4, s3
+; ZVFHMIN32-NEXT:    slli a4, a4, 2
 ; ZVFHMIN32-NEXT:    add a4, sp, a4
 ; ZVFHMIN32-NEXT:    addi a4, a4, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
@@ -1953,15 +1932,14 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 760(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 504(sp)
 ; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    li s4, 12
+; ZVFHMIN32-NEXT:    li s4, 10
 ; ZVFHMIN32-NEXT:    mul s3, s3, s4
 ; ZVFHMIN32-NEXT:    add s3, sp, s3
 ; ZVFHMIN32-NEXT:    addi s3, s3, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s s6, v8
 ; ZVFHMIN32-NEXT:    csrr s3, vlenb
-; ZVFHMIN32-NEXT:    li s4, 10
-; ZVFHMIN32-NEXT:    mul s3, s3, s4
+; ZVFHMIN32-NEXT:    slli s3, s3, 3
 ; ZVFHMIN32-NEXT:    add s3, sp, s3
 ; ZVFHMIN32-NEXT:    addi s3, s3, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
@@ -1978,38 +1956,38 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    addi s3, s3, 848
 ; ZVFHMIN32-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
 ; ZVFHMIN32-NEXT:    vmv.x.s s5, v8
-; ZVFHMIN32-NEXT:    vmv.x.s s3, v16
+; ZVFHMIN32-NEXT:    vmv.x.s s3, v2
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t3
 ; ZVFHMIN32-NEXT:    sb a0, 187(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 756(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 500(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN32-NEXT:    feq.h t4, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h t3, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t4
 ; ZVFHMIN32-NEXT:    sb a0, 186(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 754(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 498(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN32-NEXT:    feq.h t3, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h t4, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
 ; ZVFHMIN32-NEXT:    sb a0, 185(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 752(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 496(sp)
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN32-NEXT:    feq.h t1, fa5, fa4
+; ZVFHMIN32-NEXT:    feq.h t2, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t2
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t1
 ; ZVFHMIN32-NEXT:    sb a0, 184(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 750(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 494(sp)
@@ -2046,9 +2024,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN32-NEXT:    addi a1, sp, 848
-; ZVFHMIN32-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN32-NEXT:    vmv.x.s a1, v0
 ; ZVFHMIN32-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 15
 ; ZVFHMIN32-NEXT:    vmv.x.s a5, v8
@@ -2064,9 +2040,9 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    lh a0, 740(sp)
 ; ZVFHMIN32-NEXT:    lh a7, 484(sp)
 ; ZVFHMIN32-NEXT:    sb a2, 140(sp)
-; ZVFHMIN32-NEXT:    sb t1, 141(sp)
-; ZVFHMIN32-NEXT:    sb t3, 142(sp)
-; ZVFHMIN32-NEXT:    sb t4, 143(sp)
+; ZVFHMIN32-NEXT:    sb t2, 141(sp)
+; ZVFHMIN32-NEXT:    sb t4, 142(sp)
+; ZVFHMIN32-NEXT:    sb t3, 143(sp)
 ; ZVFHMIN32-NEXT:    sb a1, 136(sp)
 ; ZVFHMIN32-NEXT:    sb a6, 137(sp)
 ; ZVFHMIN32-NEXT:    sb a4, 138(sp)
@@ -2181,7 +2157,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
 ; ZVFHMIN32-NEXT:    sb a0, 244(sp)
 ; ZVFHMIN32-NEXT:    lh a0, 614(sp)
 ; ZVFHMIN32-NEXT:    lh a1, 358(sp)
@@ -2190,7 +2166,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN32-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN32-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN32-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN32-NEXT:    fmv.h.x fa5, t6
 ; ZVFHMIN32-NEXT:    vslidedown.vi v8, v24, 8
 ; ZVFHMIN32-NEXT:    vmv.x.s a1, v8
 ; ZVFHMIN32-NEXT:    fmv.h.x fa4, a1
@@ -2277,7 +2253,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    addi s0, sp, 896
 ; ZVFHMIN64-NEXT:    .cfi_def_cfa s0, 0
 ; ZVFHMIN64-NEXT:    csrr a1, vlenb
-; ZVFHMIN64-NEXT:    li a2, 30
+; ZVFHMIN64-NEXT:    li a2, 28
 ; ZVFHMIN64-NEXT:    mul a1, a1, a2
 ; ZVFHMIN64-NEXT:    sub sp, sp, a1
 ; ZVFHMIN64-NEXT:    andi sp, sp, -128
@@ -2449,67 +2425,66 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 7
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 29
+; ZVFHMIN64-NEXT:    li a3, 27
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 6
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 28
+; ZVFHMIN64-NEXT:    li a3, 26
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 5
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 27
+; ZVFHMIN64-NEXT:    li a3, 25
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 4
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 26
+; ZVFHMIN64-NEXT:    li a3, 24
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 3
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 25
+; ZVFHMIN64-NEXT:    li a3, 23
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 2
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 24
+; ZVFHMIN64-NEXT:    li a3, 22
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 1
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 23
+; ZVFHMIN64-NEXT:    li a3, 21
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v10, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v26, v8, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 14
-; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v4, v8, 15
+; ZVFHMIN64-NEXT:    vslidedown.vi v10, v8, 14
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 1
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs2r.v v10, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v4, v8, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v2, v8, 10
-; ZVFHMIN64-NEXT:    vslidedown.vi v30, v8, 9
-; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 8
+; ZVFHMIN64-NEXT:    vslidedown.vi v30, v8, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v6, v8, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v28, v8, 11
+; ZVFHMIN64-NEXT:    vslidedown.vi v26, v8, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v22, v8, 9
+; ZVFHMIN64-NEXT:    vslidedown.vi v20, v8, 8
 ; ZVFHMIN64-NEXT:    vmv.x.s a4, v16
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
@@ -2518,52 +2493,54 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 560(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 304(sp)
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v3, v16, 7
-; ZVFHMIN64-NEXT:    vslidedown.vi v31, v16, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v5, v16, 5
-; ZVFHMIN64-NEXT:    vslidedown.vi v23, v16, 4
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 3
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 7
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 21
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a3, a2, 1
+; ZVFHMIN64-NEXT:    add a2, a3, a2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 2
+; ZVFHMIN64-NEXT:    vslidedown.vi v21, v16, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 5
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 20
+; ZVFHMIN64-NEXT:    li a3, 18
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 1
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 22
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vslidedown.vi v23, v16, 4
+; ZVFHMIN64-NEXT:    vslidedown.vi v31, v16, 3
+; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 2
+; ZVFHMIN64-NEXT:    addi a2, sp, 800
 ; ZVFHMIN64-NEXT:    vs1r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v7, v16, 1
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v18, v16, 15
-; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 14
+; ZVFHMIN64-NEXT:    vslidedown.vi v14, v16, 15
+; ZVFHMIN64-NEXT:    vslidedown.vi v12, v16, 14
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v16, 13
-; ZVFHMIN64-NEXT:    vslidedown.vi v12, v16, 12
+; ZVFHMIN64-NEXT:    vslidedown.vi v18, v16, 12
 ; ZVFHMIN64-NEXT:    vslidedown.vi v10, v16, 11
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 10
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v16, 10
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 18
+; ZVFHMIN64-NEXT:    li a3, 19
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 9
+; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v16, 9
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    li a3, 14
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v6, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v6, v16, 8
+; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v16, v16, 8
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 12
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -2571,95 +2548,86 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 558(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 302(sp)
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
-; ZVFHMIN64-NEXT:    vslidedown.vi v13, v0, 7
-; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 6
-; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 5
-; ZVFHMIN64-NEXT:    vslidedown.vi v7, v0, 4
-; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 3
-; ZVFHMIN64-NEXT:    vslidedown.vi v21, v0, 2
-; ZVFHMIN64-NEXT:    vslidedown.vi v27, v0, 1
+; ZVFHMIN64-NEXT:    vslidedown.vi v29, v0, 7
+; ZVFHMIN64-NEXT:    vslidedown.vi v19, v0, 6
+; ZVFHMIN64-NEXT:    vslidedown.vi v27, v0, 5
+; ZVFHMIN64-NEXT:    vslidedown.vi v13, v0, 4
+; ZVFHMIN64-NEXT:    vslidedown.vi v15, v0, 3
+; ZVFHMIN64-NEXT:    vslidedown.vi v9, v0, 2
+; ZVFHMIN64-NEXT:    vslidedown.vi v11, v0, 1
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 15
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 2
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 14
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 13
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 14
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    li a3, 6
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 12
+; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 13
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 12
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    slli a2, a2, 2
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 11
+; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 12
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    li a3, 10
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 10
+; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 11
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    slli a2, a2, 3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 10
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
 ; ZVFHMIN64-NEXT:    slli a2, a2, 4
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vs2r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vslidedown.vi v16, v0, 9
+; ZVFHMIN64-NEXT:    vs2r.v v2, (a2) # Unknown-size Folded Spill
+; ZVFHMIN64-NEXT:    vslidedown.vi v2, v0, 9
 ; ZVFHMIN64-NEXT:    vslidedown.vi v0, v0, 8
-; ZVFHMIN64-NEXT:    addi a2, sp, 800
-; ZVFHMIN64-NEXT:    vs2r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN64-NEXT:    vmv.x.s t4, v26
+; ZVFHMIN64-NEXT:    vmv.x.s t3, v4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 215(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 556(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 300(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t3, v20
-; ZVFHMIN64-NEXT:    vmv.x.s t1, v28
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    addi a2, a2, 800
+; ZVFHMIN64-NEXT:    vl2r.v v4, (a2) # Unknown-size Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s t4, v4
+; ZVFHMIN64-NEXT:    vmv.x.s t2, v30
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 214(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 554(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 298(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 1
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    addi a2, a2, 800
-; ZVFHMIN64-NEXT:    vl2r.v v0, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t2, v0
-; ZVFHMIN64-NEXT:    vmv.x.s t0, v4
+; ZVFHMIN64-NEXT:    vmv.x.s t1, v6
+; ZVFHMIN64-NEXT:    vmv.x.s t0, v28
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 213(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 552(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 296(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a7, v2
-; ZVFHMIN64-NEXT:    vmv.x.s a6, v30
+; ZVFHMIN64-NEXT:    vmv.x.s a7, v26
+; ZVFHMIN64-NEXT:    vmv.x.s a6, v22
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 212(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 550(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 294(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a5, v22
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v18
+; ZVFHMIN64-NEXT:    vmv.x.s a5, v20
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v14
 ; ZVFHMIN64-NEXT:    sd a2, 96(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
@@ -2667,7 +2635,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 211(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 548(sp)
 ; ZVFHMIN64-NEXT:    lh t5, 292(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v14
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v12
 ; ZVFHMIN64-NEXT:    sd a0, 104(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    vmv.x.s a0, v8
 ; ZVFHMIN64-NEXT:    sd a0, 120(sp) # 8-byte Folded Spill
@@ -2694,7 +2662,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a1, 208(sp)
 ; ZVFHMIN64-NEXT:    lh t5, 738(sp)
 ; ZVFHMIN64-NEXT:    lh t6, 482(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a0, v12
+; ZVFHMIN64-NEXT:    vmv.x.s a0, v18
 ; ZVFHMIN64-NEXT:    sd a0, 88(sp) # 8-byte Folded Spill
 ; ZVFHMIN64-NEXT:    vmv.x.s a0, v10
 ; ZVFHMIN64-NEXT:    sd a0, 112(sp) # 8-byte Folded Spill
@@ -2705,12 +2673,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh t5, 736(sp)
 ; ZVFHMIN64-NEXT:    lh t6, 480(sp)
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 29
+; ZVFHMIN64-NEXT:    li a1, 27
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s5, 800(a0) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 28
+; ZVFHMIN64-NEXT:    li a1, 26
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s6, 800(a0) # 8-byte Folded Reload
@@ -2721,12 +2689,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh t5, 734(sp)
 ; ZVFHMIN64-NEXT:    lh t6, 478(sp)
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 27
+; ZVFHMIN64-NEXT:    li a1, 25
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s7, 800(a0) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 26
+; ZVFHMIN64-NEXT:    li a1, 24
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s8, 800(a0) # 8-byte Folded Reload
@@ -2737,12 +2705,12 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh t5, 732(sp)
 ; ZVFHMIN64-NEXT:    lh t6, 476(sp)
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 25
+; ZVFHMIN64-NEXT:    li a1, 23
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s4, 800(a0) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 24
+; ZVFHMIN64-NEXT:    li a1, 22
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s3, 800(a0) # 8-byte Folded Reload
@@ -2753,43 +2721,47 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh t6, 730(sp)
 ; ZVFHMIN64-NEXT:    lh s9, 474(sp)
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
-; ZVFHMIN64-NEXT:    li a1, 23
+; ZVFHMIN64-NEXT:    li a1, 21
 ; ZVFHMIN64-NEXT:    mul a0, a0, a1
 ; ZVFHMIN64-NEXT:    add a0, sp, a0
 ; ZVFHMIN64-NEXT:    lh s2, 800(a0) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v3
+; ZVFHMIN64-NEXT:    csrr a0, vlenb
+; ZVFHMIN64-NEXT:    slli a1, a0, 1
+; ZVFHMIN64-NEXT:    add a0, a1, a0
+; ZVFHMIN64-NEXT:    add a0, sp, a0
+; ZVFHMIN64-NEXT:    lh t5, 800(a0) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, s9
 ; ZVFHMIN64-NEXT:    feq.h t6, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb t6, 173(sp)
 ; ZVFHMIN64-NEXT:    lh s9, 728(sp)
 ; ZVFHMIN64-NEXT:    lh s10, 472(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v31
-; ZVFHMIN64-NEXT:    vmv.x.s ra, v13
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v21
+; ZVFHMIN64-NEXT:    vmv.x.s ra, v29
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
 ; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb s9, 172(sp)
 ; ZVFHMIN64-NEXT:    lh s9, 726(sp)
 ; ZVFHMIN64-NEXT:    lh s10, 470(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v29
-; ZVFHMIN64-NEXT:    vmv.x.s a3, v11
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v19
+; ZVFHMIN64-NEXT:    vmv.x.s a3, v27
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, s9
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, s10
 ; ZVFHMIN64-NEXT:    feq.h s9, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb s9, 171(sp)
 ; ZVFHMIN64-NEXT:    lh s10, 724(sp)
 ; ZVFHMIN64-NEXT:    lh s11, 468(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s a4, v7
-; ZVFHMIN64-NEXT:    vmv.x.s s9, v9
+; ZVFHMIN64-NEXT:    vmv.x.s a4, v13
+; ZVFHMIN64-NEXT:    vmv.x.s s9, v15
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, s10
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, s11
 ; ZVFHMIN64-NEXT:    feq.h s10, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb s10, 170(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 722(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 466(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s10, v21
-; ZVFHMIN64-NEXT:    vmv.x.s s11, v27
+; ZVFHMIN64-NEXT:    vmv.x.s s10, v9
+; ZVFHMIN64-NEXT:    vmv.x.s s11, v11
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -2872,7 +2844,11 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a1, 161(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 610(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 354(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s s6, v5
+; ZVFHMIN64-NEXT:    csrr a2, vlenb
+; ZVFHMIN64-NEXT:    li a3, 18
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
+; ZVFHMIN64-NEXT:    add a2, sp, a2
+; ZVFHMIN64-NEXT:    lh s6, 800(a2) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s s5, v23
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
@@ -2880,27 +2856,15 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    sb a0, 241(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 608(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 352(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 21
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s4, 800(a2) # 8-byte Folded Reload
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 20
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s3, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s4, v31
+; ZVFHMIN64-NEXT:    lh s3, 800(sp) # 8-byte Folded Reload
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 240(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 606(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 350(sp)
-; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 22
-; ZVFHMIN64-NEXT:    mul a2, a2, a3
-; ZVFHMIN64-NEXT:    add a2, sp, a2
-; ZVFHMIN64-NEXT:    lh s2, 800(a2) # 8-byte Folded Reload
+; ZVFHMIN64-NEXT:    vmv.x.s s2, v7
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa3, a1
@@ -3035,7 +2999,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 766(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 510(sp)
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    li a3, 18
+; ZVFHMIN64-NEXT:    li a3, 19
 ; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
@@ -3047,20 +3011,21 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s t6, v8
+; ZVFHMIN64-NEXT:    vmv.x.s t5, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    sb a0, 191(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 764(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 508(sp)
-; ZVFHMIN64-NEXT:    vmv.x.s t5, v6
 ; ZVFHMIN64-NEXT:    csrr a2, vlenb
-; ZVFHMIN64-NEXT:    slli a2, a2, 2
+; ZVFHMIN64-NEXT:    li a3, 12
+; ZVFHMIN64-NEXT:    mul a2, a2, a3
 ; ZVFHMIN64-NEXT:    add a2, sp, a2
 ; ZVFHMIN64-NEXT:    addi a2, a2, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s a2, v8
+; ZVFHMIN64-NEXT:    vmv.x.s t6, v8
+; ZVFHMIN64-NEXT:    vmv.x.s a2, v16
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
@@ -3068,14 +3033,14 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 762(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 506(sp)
 ; ZVFHMIN64-NEXT:    csrr a3, vlenb
-; ZVFHMIN64-NEXT:    slli a3, a3, 3
+; ZVFHMIN64-NEXT:    li a4, 6
+; ZVFHMIN64-NEXT:    mul a3, a3, a4
 ; ZVFHMIN64-NEXT:    add a3, sp, a3
 ; ZVFHMIN64-NEXT:    addi a3, a3, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s a3, v8
 ; ZVFHMIN64-NEXT:    csrr a4, vlenb
-; ZVFHMIN64-NEXT:    li s3, 6
-; ZVFHMIN64-NEXT:    mul a4, a4, s3
+; ZVFHMIN64-NEXT:    slli a4, a4, 2
 ; ZVFHMIN64-NEXT:    add a4, sp, a4
 ; ZVFHMIN64-NEXT:    addi a4, a4, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (a4) # Unknown-size Folded Reload
@@ -3087,15 +3052,14 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 760(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 504(sp)
 ; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    li s4, 12
+; ZVFHMIN64-NEXT:    li s4, 10
 ; ZVFHMIN64-NEXT:    mul s3, s3, s4
 ; ZVFHMIN64-NEXT:    add s3, sp, s3
 ; ZVFHMIN64-NEXT:    addi s3, s3, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s s6, v8
 ; ZVFHMIN64-NEXT:    csrr s3, vlenb
-; ZVFHMIN64-NEXT:    li s4, 10
-; ZVFHMIN64-NEXT:    mul s3, s3, s4
+; ZVFHMIN64-NEXT:    slli s3, s3, 3
 ; ZVFHMIN64-NEXT:    add s3, sp, s3
 ; ZVFHMIN64-NEXT:    addi s3, s3, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
@@ -3112,38 +3076,38 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    addi s3, s3, 800
 ; ZVFHMIN64-NEXT:    vl2r.v v8, (s3) # Unknown-size Folded Reload
 ; ZVFHMIN64-NEXT:    vmv.x.s s5, v8
-; ZVFHMIN64-NEXT:    vmv.x.s s3, v16
+; ZVFHMIN64-NEXT:    vmv.x.s s3, v2
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t3
 ; ZVFHMIN64-NEXT:    sb a0, 187(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 756(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 500(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a2
-; ZVFHMIN64-NEXT:    feq.h t4, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h t3, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t3
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t4
 ; ZVFHMIN64-NEXT:    sb a0, 186(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 754(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 498(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a3
-; ZVFHMIN64-NEXT:    feq.h t3, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h t4, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t1
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
 ; ZVFHMIN64-NEXT:    sb a0, 185(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 752(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 496(sp)
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a4
-; ZVFHMIN64-NEXT:    feq.h t1, fa5, fa4
+; ZVFHMIN64-NEXT:    feq.h t2, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t2
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t1
 ; ZVFHMIN64-NEXT:    sb a0, 184(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 750(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 494(sp)
@@ -3180,9 +3144,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a5
-; ZVFHMIN64-NEXT:    addi a1, sp, 800
-; ZVFHMIN64-NEXT:    vl2r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
+; ZVFHMIN64-NEXT:    vmv.x.s a1, v0
 ; ZVFHMIN64-NEXT:    vsetivli zero, 1, e16, m2, ta, ma
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 15
 ; ZVFHMIN64-NEXT:    vmv.x.s a5, v8
@@ -3198,9 +3160,9 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    lh a0, 740(sp)
 ; ZVFHMIN64-NEXT:    lh a7, 484(sp)
 ; ZVFHMIN64-NEXT:    sb a2, 140(sp)
-; ZVFHMIN64-NEXT:    sb t1, 141(sp)
-; ZVFHMIN64-NEXT:    sb t3, 142(sp)
-; ZVFHMIN64-NEXT:    sb t4, 143(sp)
+; ZVFHMIN64-NEXT:    sb t2, 141(sp)
+; ZVFHMIN64-NEXT:    sb t4, 142(sp)
+; ZVFHMIN64-NEXT:    sb t3, 143(sp)
 ; ZVFHMIN64-NEXT:    sb a1, 136(sp)
 ; ZVFHMIN64-NEXT:    sb a6, 137(sp)
 ; ZVFHMIN64-NEXT:    sb a4, 138(sp)
@@ -3315,7 +3277,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
 ; ZVFHMIN64-NEXT:    sb a0, 244(sp)
 ; ZVFHMIN64-NEXT:    lh a0, 614(sp)
 ; ZVFHMIN64-NEXT:    lh a1, 358(sp)
@@ -3324,7 +3286,7 @@ define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128
 ; ZVFHMIN64-NEXT:    fmv.h.x fa5, a0
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
 ; ZVFHMIN64-NEXT:    feq.h a0, fa5, fa4
-; ZVFHMIN64-NEXT:    fmv.h.x fa5, t5
+; ZVFHMIN64-NEXT:    fmv.h.x fa5, t6
 ; ZVFHMIN64-NEXT:    vslidedown.vi v8, v24, 8
 ; ZVFHMIN64-NEXT:    vmv.x.s a1, v8
 ; ZVFHMIN64-NEXT:    fmv.h.x fa4, a1
@@ -3944,49 +3906,39 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v16, (a0)
+; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    bltu a2, a1, .LBB87_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB87_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v7, v8, v16, v0.t
+; CHECK-NEXT:    vmfeq.vv v6, v8, v24, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    vmfeq.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vslideup.vi v7, v8, 2
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vslideup.vi v6, v8, 2
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
index d52c42891fcc3b..cf7f284d7cd085 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
@@ -595,47 +595,37 @@ define <256 x i1> @icmp_eq_vv_v256i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1>
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    li a1, 128
 ; CHECK-NEXT:    addi a4, a0, 128
 ; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
 ; CHECK-NEXT:    vlm.v v0, (a2)
 ; CHECK-NEXT:    addi a2, a3, -128
-; CHECK-NEXT:    vle8.v v8, (a4)
+; CHECK-NEXT:    vle8.v v24, (a4)
 ; CHECK-NEXT:    sltu a4, a3, a2
-; CHECK-NEXT:    vle8.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    addi a4, a4, -1
 ; CHECK-NEXT:    and a2, a4, a2
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v6, v16, v8, v0.t
+; CHECK-NEXT:    vmseq.vv v6, v16, v24, v0.t
 ; CHECK-NEXT:    bltu a3, a1, .LBB51_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 128
 ; CHECK-NEXT:  .LBB51_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v16, v8, v24, v0.t
+; CHECK-NEXT:    vmseq.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmv1r.v v8, v6
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -1250,49 +1240,39 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-NEXT:    vle32.v v16, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle32.v v16, (a0)
+; CHECK-NEXT:    vle32.v v24, (a0)
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 4
+; CHECK-NEXT:    vslidedown.vi v7, v0, 4
 ; CHECK-NEXT:    bltu a2, a3, .LBB99_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 32
 ; CHECK-NEXT:  .LBB99_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v7, v8, v16, v0.t
+; CHECK-NEXT:    vmseq.vv v6, v8, v24, v0.t
 ; CHECK-NEXT:    addi a0, a2, -32
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v8, v16, v24, v0.t
+; CHECK-NEXT:    vmseq.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v7, v8, 4
-; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vslideup.vi v6, v8, 4
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index 12893ec55cda76..f2fde30a74ab46 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -227,19 +227,18 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 72
-; CHECK-NEXT:    mul a2, a2, a3
+; CHECK-NEXT:    slli a2, a2, 6
 ; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 72 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 64 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a2, a2, a3
+; CHECK-NEXT:    slli a2, a2, 4
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 5
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -249,12 +248,12 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    addi a3, a1, 640
 ; CHECK-NEXT:    addi a4, a7, -64
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v27, v6, 4
+; CHECK-NEXT:    vslidedown.vi v30, v6, 4
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a3)
 ; CHECK-NEXT:    sltu a3, a7, a4
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v0, v27, 2
+; CHECK-NEXT:    vslidedown.vi v0, v30, 2
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and a4, a3, a4
 ; CHECK-NEXT:    addi a3, a4, -32
@@ -268,55 +267,44 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    vsetvli zero, a5, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 4
+; CHECK-NEXT:    slli a5, a5, 3
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a2)
+; CHECK-NEXT:    vle64.v v16, (a2)
 ; CHECK-NEXT:    addi a5, a1, 128
 ; CHECK-NEXT:    li a2, 16
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v26, v7, 4
+; CHECK-NEXT:    vslidedown.vi v28, v7, 4
 ; CHECK-NEXT:    bltu a3, a2, .LBB16_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a3, 16
 ; CHECK-NEXT:  .LBB16_2:
-; CHECK-NEXT:    vmv1r.v v0, v27
+; CHECK-NEXT:    vmv1r.v v0, v30
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v16, (a5)
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 56
-; CHECK-NEXT:    mul a5, a5, a6
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v8, (a5)
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v27, v26, 2
+; CHECK-NEXT:    vslidedown.vi v29, v28, 2
 ; CHECK-NEXT:    li a5, 64
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v16, v8, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 6
+; CHECK-NEXT:    li a6, 56
+; CHECK-NEXT:    mul a3, a3, a6
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    mv a6, a7
 ; CHECK-NEXT:    bltu a7, a5, .LBB16_4
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    li a6, 64
 ; CHECK-NEXT:  .LBB16_4:
-; CHECK-NEXT:    vmv1r.v v0, v27
+; CHECK-NEXT:    vmv1r.v v0, v29
 ; CHECK-NEXT:    addi a5, a1, 384
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a1)
-; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    li t1, 48
-; CHECK-NEXT:    mul t0, t0, t1
-; CHECK-NEXT:    add t0, sp, t0
-; CHECK-NEXT:    addi t0, t0, 16
-; CHECK-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v16, (a1)
 ; CHECK-NEXT:    addi t0, a6, -32
 ; CHECK-NEXT:    sltu a6, a6, t0
 ; CHECK-NEXT:    addi a6, a6, -1
@@ -325,41 +313,24 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    sltu t1, a6, t0
 ; CHECK-NEXT:    addi t1, t1, -1
 ; CHECK-NEXT:    and t0, t1, t0
-; CHECK-NEXT:    csrr t1, vlenb
-; CHECK-NEXT:    li t2, 56
-; CHECK-NEXT:    mul t1, t1, t2
-; CHECK-NEXT:    add t1, sp, t1
-; CHECK-NEXT:    addi t1, t1, 16
-; CHECK-NEXT:    vl8r.v v16, (t1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, t0, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
-; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    slli t0, t0, 3
-; CHECK-NEXT:    add t0, sp, t0
-; CHECK-NEXT:    addi t0, t0, 16
-; CHECK-NEXT:    vs8r.v v8, (t0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vnsrl.wi v24, v8, 0, v0.t
+; CHECK-NEXT:    addi t0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (t0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a6, a2, .LBB16_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    li a6, 16
 ; CHECK-NEXT:  .LBB16_6:
-; CHECK-NEXT:    vmv1r.v v0, v26
+; CHECK-NEXT:    vmv1r.v v0, v28
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a5)
-; CHECK-NEXT:    addi a5, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v24, (a5)
 ; CHECK-NEXT:    addi a1, a1, 256
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v26, v6, 2
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li t0, 48
-; CHECK-NEXT:    mul a5, a5, t0
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vslidedown.vi v12, v6, 2
 ; CHECK-NEXT:    vsetvli zero, a6, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 56
+; CHECK-NEXT:    li a6, 48
 ; CHECK-NEXT:    mul a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
@@ -375,14 +346,11 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    sltu a5, a5, a1
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    and a1, a5, a1
-; CHECK-NEXT:    vmv1r.v v0, v26
-; CHECK-NEXT:    addi a5, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a5) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v24, 0, v0.t
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a5, 40
-; CHECK-NEXT:    mul a1, a1, a5
+; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -392,11 +360,11 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB16_10:
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v25, v7, 2
+; CHECK-NEXT:    vslidedown.vi v12, v7, 2
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m4, ta, ma
 ; CHECK-NEXT:    vnsrl.wi v8, v16, 0, v0.t
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a4, 48
+; CHECK-NEXT:    li a4, 40
 ; CHECK-NEXT:    mul a1, a1, a4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
@@ -406,71 +374,67 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:  # %bb.11:
 ; CHECK-NEXT:    li a1, 32
 ; CHECK-NEXT:  .LBB16_12:
-; CHECK-NEXT:    vmv1r.v v0, v25
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 4
+; CHECK-NEXT:    slli a4, a4, 3
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmv4r.v v24, v16
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
+; CHECK-NEXT:    addi a4, sp, 16
 ; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 40
-; CHECK-NEXT:    mul a4, a4, a5
+; CHECK-NEXT:    slli a4, a4, 5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 40
-; CHECK-NEXT:    mul a4, a4, a5
+; CHECK-NEXT:    slli a4, a4, 5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 6
+; CHECK-NEXT:    li a5, 56
+; CHECK-NEXT:    mul a4, a4, a5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; CHECK-NEXT:    vslideup.vi v16, v24, 16
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 6
+; CHECK-NEXT:    li a5, 56
+; CHECK-NEXT:    mul a4, a4, a5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a4, a1, -16
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 56
+; CHECK-NEXT:    li a6, 48
 ; CHECK-NEXT:    mul a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vslideup.vi v16, v8, 16
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 56
+; CHECK-NEXT:    li a6, 48
 ; CHECK-NEXT:    mul a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 48
+; CHECK-NEXT:    li a6, 40
 ; CHECK-NEXT:    mul a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 40
-; CHECK-NEXT:    mul a5, a5, a6
+; CHECK-NEXT:    slli a5, a5, 5
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
 ; CHECK-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vslideup.vi v8, v16, 16
 ; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 48
+; CHECK-NEXT:    li a6, 40
 ; CHECK-NEXT:    mul a5, a5, a6
 ; CHECK-NEXT:    add a5, sp, a5
 ; CHECK-NEXT:    addi a5, a5, 16
@@ -479,8 +443,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a1, a1, a4
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    li a5, 24
-; CHECK-NEXT:    mul a4, a4, a5
+; CHECK-NEXT:    slli a4, a4, 4
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
@@ -492,18 +455,19 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:  .LBB16_14:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    li a2, 24
+; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a7, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v24, v16, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v16, v24, 0, v0.t
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vslideup.vi v24, v8, 16
-; CHECK-NEXT:    vse32.v v24, (a0)
+; CHECK-NEXT:    vslideup.vi v16, v8, 16
+; CHECK-NEXT:    vse32.v v16, (a0)
 ; CHECK-NEXT:    addi a1, a0, 256
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 48
+; CHECK-NEXT:    li a3, 40
 ; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
@@ -511,7 +475,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    vse32.v v8, (a1)
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 56
+; CHECK-NEXT:    li a3, 48
 ; CHECK-NEXT:    mul a2, a2, a3
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
@@ -519,14 +483,14 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
 ; CHECK-NEXT:    vse32.v v8, (a1)
 ; CHECK-NEXT:    addi a0, a0, 384
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 6
+; CHECK-NEXT:    li a2, 56
+; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 72
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 6
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
index fa82065f3b4131..68f9f23e6d3095 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vcopysign-vp.ll
@@ -298,46 +298,36 @@ define <32 x double> @vfsgnj_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v16, (a0)
+; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    bltu a2, a1, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfsgnj.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vfsgnj.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfsgnj.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
index cad7adbc19f3c8..696ff70d271bb6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmax-vp.ll
@@ -390,46 +390,36 @@ define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v16, (a0)
+; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    bltu a2, a1, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmax.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vfmax.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmax.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vfmax.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
index d8ee7a7044b49c..fdbf00ed75c5a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmin-vp.ll
@@ -390,46 +390,36 @@ define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vle64.v v16, (a0)
+; CHECK-NEXT:    vle64.v v24, (a0)
 ; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    mv a0, a2
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    bltu a2, a1, .LBB26_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    li a0, 16
 ; CHECK-NEXT:  .LBB26_2:
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmin.vv v8, v8, v16, v0.t
+; CHECK-NEXT:    vfmin.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    addi a0, a2, -16
 ; CHECK-NEXT:    sltu a1, a2, a0
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a0, a1, a0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfmin.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    vfmin.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
index 1d8af4c46cc078..828460539b0e51 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -205,51 +205,30 @@ define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a2, a2, a3
+; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    sub sp, sp, a2
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 4
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v7, v8
 ; CHECK-NEXT:    li a2, 128
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vle8.v v16, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, a1, 128
 ; CHECK-NEXT:    vle8.v v24, (a0)
+; CHECK-NEXT:    addi a0, a1, 128
+; CHECK-NEXT:    vle8.v v8, (a0)
 ; CHECK-NEXT:    vle8.v v16, (a1)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v9, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v6, v0
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
 ; CHECK-NEXT:    vmv8r.v v16, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -455,48 +434,27 @@ define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c)
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v16, v8
 ; CHECK-NEXT:    addi a1, a0, 128
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle64.v v24, (a1)
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vi v24, v0, 2
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vslidedown.vi v7, v0, 2
 ; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
index e6dfe5e78cdb4b..b43a4772df0b12 100644
--- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll
@@ -273,12 +273,6 @@ declare <vscale x 32 x bfloat> @llvm.vp.floor.nxv32bf16(<vscale x 32 x bfloat>,
 define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv32bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -302,11 +296,7 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -332,12 +322,6 @@ define <vscale x 32 x bfloat> @vp_floor_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.floor.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -840,12 +824,6 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vp_floor_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -869,11 +847,7 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -899,12 +873,6 @@ define <vscale x 32 x half> @vp_floor_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.floor.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1419,12 +1387,6 @@ declare <vscale x 16 x double> @llvm.vp.floor.nxv16f64(<vscale x 16 x double>, <
 define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_floor_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
@@ -1445,40 +1407,27 @@ define <vscale x 16 x double> @vp_floor_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 2
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.floor.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
index a1cdbd4be25794..4dc1b840c2728b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll
@@ -113,12 +113,6 @@ declare <vscale x 16 x bfloat> @llvm.maximum.nxv16bf16(<vscale x 16 x bfloat>, <
 define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) {
 ; CHECK-LABEL: vfmax_nxv16bf16_vv:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
@@ -126,20 +120,11 @@ define <vscale x 16 x bfloat> @vfmax_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
 ; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vfmax.vv v16, v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x bfloat> @llvm.maximum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b)
   ret <vscale x 16 x bfloat> %v
@@ -448,12 +433,6 @@ define <vscale x 16 x half> @vfmax_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmax_nxv16f16_vv:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
@@ -461,20 +440,11 @@ define <vscale x 16 x half> @vfmax_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v24, v24
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.maximum.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b)
   ret <vscale x 16 x half> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
index 33e793a691b81d..4990f31963fd21 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
@@ -252,12 +252,6 @@ define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <v
 define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) {
 ; CHECK-LABEL: vfmax_vv_nxv16bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
@@ -265,20 +259,11 @@ define <vscale x 16 x bfloat> @vfmax_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v16, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vfmax.vv v16, v16, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x bfloat> @llvm.vp.maximum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x bfloat> %v
@@ -292,16 +277,16 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 5
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    li a2, 25
+; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    slli a3, a1, 4
+; CHECK-NEXT:    add a1, a3, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -309,7 +294,7 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 5
+; CHECK-NEXT:    slli a4, a4, 3
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
@@ -323,7 +308,8 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vmfeq.vv v13, v24, v24, v0.t
 ; CHECK-NEXT:    vmv8r.v v0, v16
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    slli a4, a3, 3
+; CHECK-NEXT:    add a3, a4, a3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -331,102 +317,96 @@ define <vscale x 32 x bfloat> @vfmax_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
 ; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    li a4, 24
-; CHECK-NEXT:    mul a3, a3, a4
+; CHECK-NEXT:    slli a4, a3, 4
+; CHECK-NEXT:    add a3, a4, a3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmfeq.vv v13, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a2, a2, a3
+; CHECK-NEXT:    slli a3, a2, 4
+; CHECK-NEXT:    add a2, a3, a2
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    slli a2, a1, 4
+; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v24, v16, v16, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v16
-; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl1r.v v9, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v8, v16, v16, v0.t
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a2, a1, 3
+; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a2, a1, 4
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmfeq.vv v8, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmax.vv v16, v16, v24, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 5
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    li a1, 25
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -921,12 +901,6 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfmax_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
@@ -934,20 +908,11 @@ define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfmax.vv v16, v8, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v24, v16, v0
+; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.vp.maximum.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x half> %v
@@ -991,16 +956,16 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 5
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    slli a3, a1, 4
+; ZVFHMIN-NEXT:    add a1, a3, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -1008,7 +973,7 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
@@ -1022,7 +987,8 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a4, a3, 3
+; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -1030,102 +996,96 @@ define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
+; ZVFHMIN-NEXT:    slli a4, a3, 4
+; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v24, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl1r.v v9, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a2, a1, 3
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmax.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a1, a0, 5
-; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -1578,8 +1538,6 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
@@ -1606,20 +1564,19 @@ define <vscale x 16 x double> @vfmax_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v24, v16
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v5, v8, v8, v0.t
 ; CHECK-NEXT:    vl8re64.v v16, (a0)
-; CHECK-NEXT:    vmv1r.v v0, v26
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmax.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmax.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
index d41da7b6a2af96..aa5ebd008e1d99 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll
@@ -113,12 +113,6 @@ declare <vscale x 16 x bfloat> @llvm.minimum.nxv16bf16(<vscale x 16 x bfloat>, <
 define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b) {
 ; CHECK-LABEL: vfmin_nxv16bf16_vv:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
@@ -126,20 +120,11 @@ define <vscale x 16 x bfloat> @vfmin_nxv16bf16_vv(<vscale x 16 x bfloat> %a, <vs
 ; CHECK-NEXT:    vmfeq.vv v0, v24, v24
 ; CHECK-NEXT:    vmfeq.vv v7, v16, v16
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    vfmin.vv v16, v16, v8
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x bfloat> @llvm.minimum.nxv16bf16(<vscale x 16 x bfloat> %a, <vscale x 16 x bfloat> %b)
   ret <vscale x 16 x bfloat> %v
@@ -448,12 +433,6 @@ define <vscale x 16 x half> @vfmin_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
 ;
 ; ZVFHMIN-LABEL: vfmin_nxv16f16_vv:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
@@ -461,20 +440,11 @@ define <vscale x 16 x half> @vfmin_nxv16f16_vv(<vscale x 16 x half> %a, <vscale
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v24, v24
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v16, v16
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.minimum.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b)
   ret <vscale x 16 x half> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
index ef6f33de4ce636..898d248181e200 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll
@@ -252,12 +252,6 @@ define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <v
 define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, i32 zeroext %evl) {
 ; CHECK-LABEL: vfmin_vv_nxv16bf16_unmasked:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
@@ -265,20 +259,11 @@ define <vscale x 16 x bfloat> @vfmin_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat
 ; CHECK-NEXT:    vmfeq.vv v0, v16, v16
 ; CHECK-NEXT:    vmfeq.vv v7, v24, v24
 ; CHECK-NEXT:    vmerge.vvm v8, v16, v24, v0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmerge.vvm v8, v24, v16, v0
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v16, v8, v16
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    vfmin.vv v16, v16, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x bfloat> @llvm.vp.minimum.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x bfloat> %v
@@ -292,16 +277,16 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 5
-; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    li a2, 25
+; CHECK-NEXT:    mul a1, a1, a2
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    slli a3, a1, 4
+; CHECK-NEXT:    add a1, a3, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -309,7 +294,7 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 5
+; CHECK-NEXT:    slli a4, a4, 3
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
@@ -323,7 +308,8 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vmfeq.vv v13, v24, v24, v0.t
 ; CHECK-NEXT:    vmv8r.v v0, v16
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
+; CHECK-NEXT:    slli a4, a3, 3
+; CHECK-NEXT:    add a3, a4, a3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -331,102 +317,96 @@ define <vscale x 32 x bfloat> @vfmin_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
 ; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    li a4, 24
-; CHECK-NEXT:    mul a3, a3, a4
+; CHECK-NEXT:    slli a4, a3, 4
+; CHECK-NEXT:    add a3, a4, a3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vmfeq.vv v13, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v13
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a2, a2, a3
+; CHECK-NEXT:    slli a3, a2, 4
+; CHECK-NEXT:    add a2, a3, a2
 ; CHECK-NEXT:    add a2, sp, a2
 ; CHECK-NEXT:    addi a2, a2, 16
 ; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    addi a2, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    slli a2, a2, 3
-; CHECK-NEXT:    add a2, sp, a2
-; CHECK-NEXT:    addi a2, a2, 16
-; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB10_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB10_2:
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    slli a2, a1, 4
+; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v24, v16, v16, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v16
-; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl1r.v v9, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vv v8, v16, v16, v0.t
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a2, a1, 3
+; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a2, a1, 4
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v24, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    vmfeq.vv v8, v16, v16, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v9
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfmin.vv v16, v16, v24, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 5
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    li a1, 25
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -921,12 +901,6 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ;
 ; ZVFHMIN-LABEL: vfmin_vv_nxv16f16_unmasked:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
@@ -934,20 +908,11 @@ define <vscale x 16 x half> @vfmin_vv_nxv16f16_unmasked(<vscale x 16 x half> %va
 ; ZVFHMIN-NEXT:    vmfeq.vv v0, v16, v16
 ; ZVFHMIN-NEXT:    vmfeq.vv v7, v24, v24
 ; ZVFHMIN-NEXT:    vmerge.vvm v8, v16, v24, v0
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
-; ZVFHMIN-NEXT:    vmerge.vvm v8, v24, v16, v0
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfmin.vv v16, v8, v16
+; ZVFHMIN-NEXT:    vmerge.vvm v16, v24, v16, v0
+; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.vp.minimum.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> splat (i1 true), i32 %evl)
   ret <vscale x 16 x half> %v
@@ -991,16 +956,16 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 5
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    li a2, 25
+; ZVFHMIN-NEXT:    mul a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x19, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 25 * vlenb
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a1, a1, a3
+; ZVFHMIN-NEXT:    slli a3, a1, 4
+; ZVFHMIN-NEXT:    add a1, a3, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
@@ -1008,7 +973,7 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
@@ -1022,7 +987,8 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmfeq.vv v13, v24, v24, v0.t
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a4, a3, 3
+; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -1030,102 +996,96 @@ define <vscale x 32 x half> @vfmin_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    li a4, 24
-; ZVFHMIN-NEXT:    mul a3, a3, a4
+; ZVFHMIN-NEXT:    slli a4, a3, 4
+; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vmfeq.vv v13, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v13
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    li a3, 24
-; ZVFHMIN-NEXT:    mul a2, a2, a3
+; ZVFHMIN-NEXT:    slli a3, a2, 4
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v24, v16, v16, v0.t
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl1r.v v9, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv1r.v v0, v9
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a2, a1, 3
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
+; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT:    vmerge.vvm v24, v24, v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl1r.v v9, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v8
-; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vmerge.vvm v16, v16, v24, v0
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v9
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 3
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfmin.vv v16, v16, v24, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a1, a0, 5
-; ZVFHMIN-NEXT:    add a0, a1, a0
+; ZVFHMIN-NEXT:    li a1, 25
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -1578,8 +1538,6 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
@@ -1606,20 +1564,19 @@ define <vscale x 16 x double> @vfmin_vv_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v24, v16
 ; CHECK-NEXT:    vmv1r.v v0, v6
-; CHECK-NEXT:    vmfeq.vv v26, v8, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v5, v8, v8, v0.t
 ; CHECK-NEXT:    vl8re64.v v16, (a0)
-; CHECK-NEXT:    vmv1r.v v0, v26
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmerge.vvm v24, v8, v24, v0
+; CHECK-NEXT:    vmv1r.v v0, v5
+; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfmin.vv v8, v24, v8, v0.t
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfmin.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
index c7e3c8cb519829..976c66d80d472f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fshr-fshl-vp.ll
@@ -984,19 +984,18 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 48
-; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    slli a1, a1, 5
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -1005,97 +1004,41 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    sub a5, a4, a1
 ; CHECK-NEXT:    add a6, a2, a3
 ; CHECK-NEXT:    vl8re64.v v8, (a6)
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    li a7, 40
-; CHECK-NEXT:    mul a6, a6, a7
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
 ; CHECK-NEXT:    sltu a6, a4, a5
 ; CHECK-NEXT:    addi a6, a6, -1
 ; CHECK-NEXT:    and a5, a6, a5
 ; CHECK-NEXT:    srli a6, a1, 3
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    vl8re64.v v16, (a3)
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 4
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v0, a6
 ; CHECK-NEXT:    li a3, 63
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    li a7, 40
-; CHECK-NEXT:    mul a6, a6, a7
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 3
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 4
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 3
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    slli a5, a5, 3
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
+; CHECK-NEXT:    vand.vx v24, v8, a3, v0.t
+; CHECK-NEXT:    vsrl.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    addi a5, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a5, vlenb
-; CHECK-NEXT:    li a6, 40
-; CHECK-NEXT:    mul a5, a5, a6
-; CHECK-NEXT:    add a5, sp, a5
-; CHECK-NEXT:    addi a5, a5, 16
-; CHECK-NEXT:    vl8r.v v8, (a5) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
-; CHECK-NEXT:    addi a5, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
+; CHECK-NEXT:    vand.vx v16, v8, a3, v0.t
 ; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v8, (a2)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 40
-; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v24, (a2)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vi v16, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
+; CHECK-NEXT:    vsll.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
@@ -1103,56 +1046,33 @@ define <vscale x 16 x i64> @fshr_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB46_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v16, a3, v0.t
+; CHECK-NEXT:    vand.vx v8, v24, a3, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
+; CHECK-NEXT:    vnot.v v24, v24, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a3, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
+; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnot.v v16, v8, v0.t
-; CHECK-NEXT:    vand.vx v16, v16, a3, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsll.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vsll.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsll.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 48
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 5
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -1169,151 +1089,77 @@ define <vscale x 16 x i64> @fshl_v16i64(<vscale x 16 x i64> %a, <vscale x 16 x i
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 40
+; CHECK-NEXT:    li a3, 24
 ; CHECK-NEXT:    mul a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 5
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a3, 24
-; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a5, a3, 3
-; CHECK-NEXT:    srli a1, a3, 3
-; CHECK-NEXT:    sub a6, a4, a3
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a5, a1, 3
+; CHECK-NEXT:    srli a3, a1, 3
+; CHECK-NEXT:    sub a6, a4, a1
 ; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a1
-; CHECK-NEXT:    add a1, a2, a5
-; CHECK-NEXT:    vl8re64.v v8, (a1)
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    sltu a1, a4, a6
-; CHECK-NEXT:    addi a1, a1, -1
-; CHECK-NEXT:    and a6, a1, a6
-; CHECK-NEXT:    li a1, 63
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
+; CHECK-NEXT:    add a3, a2, a5
+; CHECK-NEXT:    vl8re64.v v8, (a3)
+; CHECK-NEXT:    sltu a3, a4, a6
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a6, a3, a6
+; CHECK-NEXT:    li a3, 63
 ; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 4
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 5
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 4
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 3
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
+; CHECK-NEXT:    vand.vx v24, v8, a3, v0.t
+; CHECK-NEXT:    vsll.vv v16, v16, v24, v0.t
+; CHECK-NEXT:    addi a6, sp, 16
 ; CHECK-NEXT:    vs8r.v v16, (a6) # Unknown-size Folded Spill
 ; CHECK-NEXT:    add a5, a0, a5
-; CHECK-NEXT:    addi a6, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vnot.v v8, v8, v0.t
 ; CHECK-NEXT:    vl8re64.v v16, (a5)
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    addi a5, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a5) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8re64.v v8, (a2)
+; CHECK-NEXT:    vand.vx v8, v8, a3, v0.t
+; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v24, (a2)
 ; CHECK-NEXT:    vsrl.vi v16, v16, 1, v0.t
+; CHECK-NEXT:    vsrl.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsrl.vv v16, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    bltu a4, a3, .LBB47_2
+; CHECK-NEXT:    bltu a4, a1, .LBB47_2
 ; CHECK-NEXT:  # %bb.1:
-; CHECK-NEXT:    mv a4, a3
+; CHECK-NEXT:    mv a4, a1
 ; CHECK-NEXT:  .LBB47_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vand.vx v8, v8, a1, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vand.vx v8, v24, a3, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vsll.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a0, a0, a2
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vnot.v v8, v8, v0.t
-; CHECK-NEXT:    vand.vx v16, v8, a1, v0.t
+; CHECK-NEXT:    vsll.vv v16, v16, v8, v0.t
+; CHECK-NEXT:    vnot.v v24, v24, v0.t
+; CHECK-NEXT:    vand.vx v24, v24, a3, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsrl.vi v8, v8, 1, v0.t
-; CHECK-NEXT:    vsrl.vv v8, v8, v16, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vsrl.vv v8, v8, v24, v0.t
 ; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
+; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
index 72c251ce985cbf..3540bb5516b537 100644
--- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll
@@ -1894,57 +1894,26 @@ define void @mscatter_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double
 ; RV64-NEXT:    addi sp, sp, -16
 ; RV64-NEXT:    .cfi_def_cfa_offset 16
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 5
+; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    sub sp, sp, a2
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    li a3, 24
-; RV64-NEXT:    mul a2, a2, a3
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; RV64-NEXT:    addi a2, sp, 16
 ; RV64-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 4
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; RV64-NEXT:    vl8re64.v v8, (a0)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT:    vmv8r.v v16, v8
+; RV64-NEXT:    vl8re64.v v24, (a0)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    vl8re64.v v8, (a1)
-; RV64-NEXT:    addi a1, sp, 16
-; RV64-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; RV64-NEXT:    srli a0, a0, 3
 ; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; RV64-NEXT:    vslidedown.vx v24, v0, a0
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 4
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vslidedown.vx v7, v0, a0
 ; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT:    vsoxei64.v v16, (zero), v8, v0.t
-; RV64-NEXT:    vmv1r.v v0, v24
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a1, 24
-; RV64-NEXT:    mul a0, a0, a1
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vsoxei64.v v16, (zero), v24, v0.t
+; RV64-NEXT:    vmv1r.v v0, v7
 ; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; RV64-NEXT:    vsoxei64.v v16, (zero), v8, v0.t
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    slli a0, a0, 3
 ; RV64-NEXT:    add sp, sp, a0
 ; RV64-NEXT:    .cfi_def_cfa sp, 16
 ; RV64-NEXT:    addi sp, sp, 16
@@ -2001,35 +1970,20 @@ define void @mscatter_baseidx_nxv16i8_nxv16f64(<vscale x 8 x double> %val0, <vsc
 define void @mscatter_baseidx_nxv16i16_nxv16f64(<vscale x 8 x double> %val0, <vscale x 8 x double> %val1, ptr %base, <vscale x 16 x i16> %idxs, <vscale x 16 x i1> %m) {
 ; RV32-LABEL: mscatter_baseidx_nxv16i16_nxv16f64:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -16
-; RV32-NEXT:    .cfi_def_cfa_offset 16
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    sub sp, sp, a2
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; RV32-NEXT:    vmv8r.v v16, v8
-; RV32-NEXT:    vl4re16.v v8, (a1)
+; RV32-NEXT:    vmv1r.v v6, v0
+; RV32-NEXT:    vl4re16.v v0, (a1)
 ; RV32-NEXT:    csrr a1, vlenb
 ; RV32-NEXT:    srli a1, a1, 3
 ; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, ma
-; RV32-NEXT:    vslidedown.vx v7, v0, a1
+; RV32-NEXT:    vslidedown.vx v7, v6, a1
 ; RV32-NEXT:    vsetvli a1, zero, e32, m8, ta, ma
-; RV32-NEXT:    vsext.vf2 v24, v8
-; RV32-NEXT:    vsll.vi v8, v24, 3
+; RV32-NEXT:    vsext.vf2 v24, v0
+; RV32-NEXT:    vsll.vi v24, v24, 3
+; RV32-NEXT:    vmv1r.v v0, v6
 ; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vsoxei32.v v16, (a0), v8, v0.t
+; RV32-NEXT:    vsoxei32.v v8, (a0), v24, v0.t
 ; RV32-NEXT:    vmv1r.v v0, v7
-; RV32-NEXT:    addi a1, sp, 16
-; RV32-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
-; RV32-NEXT:    vsoxei32.v v16, (a0), v12, v0.t
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 3
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 16
-; RV32-NEXT:    addi sp, sp, 16
-; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    vsoxei32.v v16, (a0), v28, v0.t
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: mscatter_baseidx_nxv16i16_nxv16f64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
index 2a69dd31118bd8..ad6b28e04025c5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll
@@ -253,12 +253,6 @@ declare <vscale x 32 x bfloat> @llvm.vp.rint.nxv32bf16(<vscale x 32 x bfloat>, <
 define <vscale x 32 x bfloat> @vp_rint_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv32bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -276,9 +270,6 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v16, v24, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v12
@@ -307,12 +298,6 @@ define <vscale x 32 x bfloat> @vp_rint_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.rint.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -769,12 +754,6 @@ define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vp_rint_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -792,9 +771,6 @@ define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfabs.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vmflt.vf v12, v16, fa5, v0.t
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
@@ -823,12 +799,6 @@ define <vscale x 32 x half> @vp_rint_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.rint.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1297,12 +1267,6 @@ declare <vscale x 16 x double> @llvm.vp.rint.nxv16f64(<vscale x 16 x double>, <v
 define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_rint_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
@@ -1317,9 +1281,6 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
 ; CHECK-NEXT:    vfabs.v v24, v16, v0.t
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vmflt.vf v6, v24, fa5, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v6
@@ -1343,12 +1304,6 @@ define <vscale x 16 x double> @vp_rint_nxv16f64(<vscale x 16 x double> %va, <vsc
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.rint.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
index 8a10e75333ad0a..d5879ea519201b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll
@@ -273,12 +273,6 @@ declare <vscale x 32 x bfloat> @llvm.vp.round.nxv32bf16(<vscale x 32 x bfloat>,
 define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv32bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -302,11 +296,7 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -332,12 +322,6 @@ define <vscale x 32 x bfloat> @vp_round_nxv32bf16(<vscale x 32 x bfloat> %va, <v
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.round.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -840,12 +824,6 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vp_round_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -869,11 +847,7 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -899,12 +873,6 @@ define <vscale x 32 x half> @vp_round_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.round.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1419,12 +1387,6 @@ declare <vscale x 16 x double> @llvm.vp.round.nxv16f64(<vscale x 16 x double>, <
 define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_round_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
@@ -1445,40 +1407,27 @@ define <vscale x 16 x double> @vp_round_nxv16f64(<vscale x 16 x double> %va, <vs
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 4
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.round.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
index 4cd909e4b0a637..d746a96596f8d2 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll
@@ -273,12 +273,6 @@ declare <vscale x 32 x bfloat> @llvm.vp.roundeven.nxv32bf16(<vscale x 32 x bfloa
 define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv32bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -302,11 +296,7 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -332,12 +322,6 @@ define <vscale x 32 x bfloat> @vp_roundeven_nxv32bf16(<vscale x 32 x bfloat> %va
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.roundeven.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -840,12 +824,6 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ;
 ; ZVFHMIN-LABEL: vp_roundeven_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -869,11 +847,7 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -899,12 +873,6 @@ define <vscale x 32 x half> @vp_roundeven_nxv32f16(<vscale x 32 x half> %va, <vs
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.roundeven.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1419,12 +1387,6 @@ declare <vscale x 16 x double> @llvm.vp.roundeven.nxv16f64(<vscale x 16 x double
 define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundeven_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
@@ -1445,40 +1407,27 @@ define <vscale x 16 x double> @vp_roundeven_nxv16f64(<vscale x 16 x double> %va,
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 0
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.roundeven.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
index 96c821a76ae84e..e11df1338d8713 100644
--- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll
@@ -273,12 +273,6 @@ declare <vscale x 32 x bfloat> @llvm.vp.roundtozero.nxv32bf16(<vscale x 32 x bfl
 define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv32bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -302,11 +296,7 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %
 ; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -332,12 +322,6 @@ define <vscale x 32 x bfloat> @vp_roundtozero_nxv32bf16(<vscale x 32 x bfloat> %
 ; CHECK-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x bfloat> @llvm.vp.roundtozero.nxv32bf16(<vscale x 32 x bfloat> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x bfloat> %v
@@ -840,12 +824,6 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ;
 ; ZVFHMIN-LABEL: vp_roundtozero_nxv32f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v7, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
@@ -869,11 +847,7 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfcvt.x.f.v v16, v24, v0.t
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fsrm a2
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfcvt.f.x.v v16, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, mu
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
@@ -899,12 +873,6 @@ define <vscale x 32 x half> @vp_roundtozero_nxv32f16(<vscale x 32 x half> %va, <
 ; ZVFHMIN-NEXT:    vfsgnj.vv v24, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vp.roundtozero.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x i1> %m, i32 %evl)
   ret <vscale x 32 x half> %v
@@ -1419,12 +1387,6 @@ declare <vscale x 16 x double> @llvm.vp.roundtozero.nxv16f64(<vscale x 16 x doub
 define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vp_roundtozero_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    lui a2, %hi(.LCPI44_0)
@@ -1445,40 +1407,27 @@ define <vscale x 16 x double> @vp_roundtozero_nxv16f64(<vscale x 16 x double> %v
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vfcvt.x.f.v v24, v16, v0.t
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    fsrm a2
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
 ; CHECK-NEXT:    vfsgnj.vv v16, v24, v16, v0.t
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    bltu a0, a1, .LBB44_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB44_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vfabs.v v16, v8, v0.t
+; CHECK-NEXT:    vfabs.v v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vmflt.vf v7, v16, fa5, v0.t
+; CHECK-NEXT:    vmflt.vf v7, v24, fa5, v0.t
 ; CHECK-NEXT:    fsrmi a0, 1
 ; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vfcvt.x.f.v v16, v8, v0.t
+; CHECK-NEXT:    vfcvt.x.f.v v24, v8, v0.t
 ; CHECK-NEXT:    fsrm a0
-; CHECK-NEXT:    vfcvt.f.x.v v16, v16, v0.t
+; CHECK-NEXT:    vfcvt.f.x.v v24, v24, v0.t
 ; CHECK-NEXT:    vsetvli zero, zero, e64, m8, ta, mu
-; CHECK-NEXT:    vfsgnj.vv v8, v16, v8, v0.t
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vfsgnj.vv v8, v24, v8, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x double> @llvm.vp.roundtozero.nxv16f64(<vscale x 16 x double> %va, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x double> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 6c11e9413525e0..e9fd42566f43a6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -1467,18 +1467,21 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    add a3, a3, a1
 ; CHECK-NEXT:    slli a1, a1, 2
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    slli a1, a1, 1
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
 ; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv8r.v v0, v16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    mv a3, a1
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a3, a3, a1
 ; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
@@ -1489,7 +1492,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    slli a4, a3, 1
 ; CHECK-NEXT:    add a1, a0, a1
 ; CHECK-NEXT:    sub a6, a2, a5
-; CHECK-NEXT:    vl8re16.v v0, (a1)
+; CHECK-NEXT:    vl8re16.v v16, (a1)
 ; CHECK-NEXT:    sltu a1, a2, a6
 ; CHECK-NEXT:    addi a1, a1, -1
 ; CHECK-NEXT:    and a6, a1, a6
@@ -1500,66 +1503,62 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    srli a1, a3, 1
 ; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    csrr t0, vlenb
-; CHECK-NEXT:    slli t0, t0, 3
+; CHECK-NEXT:    slli t0, t0, 1
 ; CHECK-NEXT:    add t0, sp, t0
 ; CHECK-NEXT:    addi t0, t0, 16
 ; CHECK-NEXT:    vs1r.v v24, (t0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v25, v24, a1
+; CHECK-NEXT:    vslidedown.vx v8, v24, a1
+; CHECK-NEXT:    csrr t0, vlenb
+; CHECK-NEXT:    add t0, sp, t0
+; CHECK-NEXT:    addi t0, t0, 16
+; CHECK-NEXT:    vs1r.v v8, (t0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli t0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v24, v25, a3
+; CHECK-NEXT:    vslidedown.vx v8, v8, a3
+; CHECK-NEXT:    addi t0, sp, 16
+; CHECK-NEXT:    vs1r.v v8, (t0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vl8re16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli t0, a0, 5
-; CHECK-NEXT:    add a0, t0, a0
+; CHECK-NEXT:    mv t0, a0
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add t0, t0, a0
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add a0, a0, t0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli t0, a0, 3
-; CHECK-NEXT:    add a0, t0, a0
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v4
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli t0, a0, 4
+; CHECK-NEXT:    slli t0, a0, 1
 ; CHECK-NEXT:    add a0, t0, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v4
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a7, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v6, v16, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v4, v24, v8, v0.t
 ; CHECK-NEXT:    bltu a6, a4, .LBB85_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a6, a4
 ; CHECK-NEXT:  .LBB85_2:
-; CHECK-NEXT:    vmv1r.v v0, v25
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a7, a0, 3
-; CHECK-NEXT:    add a0, a7, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a7, a0, 4
+; CHECK-NEXT:    slli a7, a0, 1
 ; CHECK-NEXT:    add a0, a7, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
 ; CHECK-NEXT:    vsetvli zero, a6, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v5, v24, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v6, v24, v8, v0.t
 ; CHECK-NEXT:    add a0, a3, a3
 ; CHECK-NEXT:    bltu a2, a5, .LBB85_4
 ; CHECK-NEXT:  # %bb.3:
@@ -1567,7 +1566,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:  .LBB85_4:
 ; CHECK-NEXT:    sub a5, a2, a4
 ; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a6, a6, 3
+; CHECK-NEXT:    slli a6, a6, 1
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
 ; CHECK-NEXT:    vl1r.v v7, (a6) # Unknown-size Folded Reload
@@ -1575,51 +1574,54 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    vslidedown.vx v0, v7, a3
 ; CHECK-NEXT:    csrr a6, vlenb
 ; CHECK-NEXT:    mv a7, a6
-; CHECK-NEXT:    slli a6, a6, 3
-; CHECK-NEXT:    add a7, a7, a6
 ; CHECK-NEXT:    slli a6, a6, 1
+; CHECK-NEXT:    add a7, a7, a6
+; CHECK-NEXT:    slli a6, a6, 3
 ; CHECK-NEXT:    add a6, a6, a7
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a6, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a7, a6, 4
-; CHECK-NEXT:    add a6, a7, a6
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a7, a6, 5
-; CHECK-NEXT:    add a6, a7, a6
+; CHECK-NEXT:    mv a7, a6
+; CHECK-NEXT:    slli a6, a6, 1
+; CHECK-NEXT:    add a7, a7, a6
+; CHECK-NEXT:    slli a6, a6, 2
+; CHECK-NEXT:    add a6, a6, a7
 ; CHECK-NEXT:    add a6, sp, a6
 ; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; CHECK-NEXT:    sltu a6, a2, a5
 ; CHECK-NEXT:    addi a6, a6, -1
 ; CHECK-NEXT:    and a5, a6, a5
-; CHECK-NEXT:    csrr a6, vlenb
-; CHECK-NEXT:    slli a7, a6, 4
-; CHECK-NEXT:    add a6, a7, a6
-; CHECK-NEXT:    add a6, sp, a6
-; CHECK-NEXT:    addi a6, a6, 16
-; CHECK-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; CHECK-NEXT:    vmfeq.vv v4, v24, v8, v0.t
+; CHECK-NEXT:    vmfeq.vv v5, v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v5, v6, a3
+; CHECK-NEXT:    vslideup.vx v6, v4, a3
 ; CHECK-NEXT:    bltu a2, a4, .LBB85_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a4
 ; CHECK-NEXT:  .LBB85_6:
 ; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    mv a5, a4
+; CHECK-NEXT:    slli a4, a4, 1
+; CHECK-NEXT:    add a5, a5, a4
+; CHECK-NEXT:    slli a4, a4, 3
+; CHECK-NEXT:    add a4, a4, a5
+; CHECK-NEXT:    add a4, sp, a4
+; CHECK-NEXT:    addi a4, a4, 16
+; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a5, a4, 5
-; CHECK-NEXT:    add a4, a5, a4
+; CHECK-NEXT:    mv a5, a4
+; CHECK-NEXT:    slli a4, a4, 1
+; CHECK-NEXT:    add a5, a5, a4
+; CHECK-NEXT:    slli a4, a4, 2
+; CHECK-NEXT:    add a4, a4, a5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
 ; CHECK-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
@@ -1627,16 +1629,18 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64bf16(<vscale x 64 x bfloat> %va, <vs
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vmfeq.vv v8, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v4, a3
+; CHECK-NEXT:    vslideup.vx v8, v5, a3
 ; CHECK-NEXT:    add a0, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v5, a1
+; CHECK-NEXT:    vslideup.vx v8, v6, a1
 ; CHECK-NEXT:    vmv.v.v v0, v8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    mv a1, a0
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add a1, a1, a0
 ; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add a1, a1, a0
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -3717,14 +3721,11 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFH-NEXT:    addi sp, sp, -16
 ; ZVFH-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFH-NEXT:    csrr a1, vlenb
-; ZVFH-NEXT:    slli a1, a1, 4
-; ZVFH-NEXT:    sub sp, sp, a1
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFH-NEXT:    vmv1r.v v24, v0
-; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    slli a1, a1, 3
-; ZVFH-NEXT:    add a1, sp, a1
-; ZVFH-NEXT:    addi a1, a1, 16
+; ZVFH-NEXT:    sub sp, sp, a1
+; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT:    vmv1r.v v7, v0
+; ZVFH-NEXT:    addi a1, sp, 16
 ; ZVFH-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; ZVFH-NEXT:    csrr a3, vlenb
 ; ZVFH-NEXT:    srli a1, a3, 1
@@ -3732,37 +3733,30 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFH-NEXT:    slli a3, a3, 2
 ; ZVFH-NEXT:    add a4, a0, a4
 ; ZVFH-NEXT:    sub a5, a2, a3
-; ZVFH-NEXT:    vl8re16.v v8, (a4)
+; ZVFH-NEXT:    vl8re16.v v24, (a4)
 ; ZVFH-NEXT:    sltu a4, a2, a5
 ; ZVFH-NEXT:    addi a4, a4, -1
-; ZVFH-NEXT:    vl8re16.v v0, (a0)
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vl8re16.v v8, (a0)
 ; ZVFH-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; ZVFH-NEXT:    vslidedown.vx v0, v24, a1
+; ZVFH-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFH-NEXT:    and a4, a4, a5
 ; ZVFH-NEXT:    vsetvli zero, a4, e16, m8, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v7, v16, v8, v0.t
+; ZVFH-NEXT:    vmfeq.vv v6, v16, v24, v0.t
 ; ZVFH-NEXT:    bltu a2, a3, .LBB171_2
 ; ZVFH-NEXT:  # %bb.1:
 ; ZVFH-NEXT:    mv a2, a3
 ; ZVFH-NEXT:  .LBB171_2:
-; ZVFH-NEXT:    vmv1r.v v0, v24
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    add a0, sp, a0
-; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vmv1r.v v0, v7
 ; ZVFH-NEXT:    addi a0, sp, 16
 ; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
-; ZVFH-NEXT:    vmfeq.vv v16, v8, v24, v0.t
+; ZVFH-NEXT:    vmfeq.vv v16, v24, v8, v0.t
 ; ZVFH-NEXT:    add a0, a1, a1
 ; ZVFH-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; ZVFH-NEXT:    vslideup.vx v16, v7, a1
+; ZVFH-NEXT:    vslideup.vx v16, v6, a1
 ; ZVFH-NEXT:    vmv.v.v v0, v16
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    add sp, sp, a0
 ; ZVFH-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFH-NEXT:    addi sp, sp, 16
@@ -3775,18 +3769,21 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    add a3, a3, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 2
+; ZVFHMIN-NEXT:    add a3, a3, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    add a1, a1, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x1b, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 27 * vlenb
 ; ZVFHMIN-NEXT:    vmv1r.v v24, v0
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    mv a3, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a3, a3, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a3, a3, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, a1, a3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
@@ -3797,7 +3794,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    slli a4, a3, 1
 ; ZVFHMIN-NEXT:    add a1, a0, a1
 ; ZVFHMIN-NEXT:    sub a6, a2, a5
-; ZVFHMIN-NEXT:    vl8re16.v v0, (a1)
+; ZVFHMIN-NEXT:    vl8re16.v v16, (a1)
 ; ZVFHMIN-NEXT:    sltu a1, a2, a6
 ; ZVFHMIN-NEXT:    addi a1, a1, -1
 ; ZVFHMIN-NEXT:    and a6, a1, a6
@@ -3808,66 +3805,62 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    srli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    csrr t0, vlenb
-; ZVFHMIN-NEXT:    slli t0, t0, 3
+; ZVFHMIN-NEXT:    slli t0, t0, 1
 ; ZVFHMIN-NEXT:    add t0, sp, t0
 ; ZVFHMIN-NEXT:    addi t0, t0, 16
 ; ZVFHMIN-NEXT:    vs1r.v v24, (t0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v25, v24, a1
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v24, a1
+; ZVFHMIN-NEXT:    csrr t0, vlenb
+; ZVFHMIN-NEXT:    add t0, sp, t0
+; ZVFHMIN-NEXT:    addi t0, t0, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (t0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli t0, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v24, v25, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v8, a3
+; ZVFHMIN-NEXT:    addi t0, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (t0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli t0, a0, 5
-; ZVFHMIN-NEXT:    add a0, t0, a0
+; ZVFHMIN-NEXT:    mv t0, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add t0, t0, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, t0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli t0, a0, 3
-; ZVFHMIN-NEXT:    add a0, t0, a0
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli t0, a0, 4
+; ZVFHMIN-NEXT:    slli t0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, t0, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vmv1r.v v0, v24
+; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a7, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v6, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v4, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    bltu a6, a4, .LBB171_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a6, a4
 ; ZVFHMIN-NEXT:  .LBB171_2:
-; ZVFHMIN-NEXT:    vmv1r.v v0, v25
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a7, a0, 3
-; ZVFHMIN-NEXT:    add a0, a7, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a7, a0, 4
+; ZVFHMIN-NEXT:    slli a7, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a7, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    vsetvli zero, a6, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v5, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v6, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    add a0, a3, a3
 ; ZVFHMIN-NEXT:    bltu a2, a5, .LBB171_4
 ; ZVFHMIN-NEXT:  # %bb.3:
@@ -3875,7 +3868,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:  .LBB171_4:
 ; ZVFHMIN-NEXT:    sub a5, a2, a4
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a6, a6, 3
+; ZVFHMIN-NEXT:    slli a6, a6, 1
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
 ; ZVFHMIN-NEXT:    vl1r.v v7, (a6) # Unknown-size Folded Reload
@@ -3883,51 +3876,54 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
 ; ZVFHMIN-NEXT:    mv a7, a6
-; ZVFHMIN-NEXT:    slli a6, a6, 3
-; ZVFHMIN-NEXT:    add a7, a7, a6
 ; ZVFHMIN-NEXT:    slli a6, a6, 1
+; ZVFHMIN-NEXT:    add a7, a7, a6
+; ZVFHMIN-NEXT:    slli a6, a6, 3
 ; ZVFHMIN-NEXT:    add a6, a6, a7
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a6) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a6, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 4
-; ZVFHMIN-NEXT:    add a6, a7, a6
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a6) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 5
-; ZVFHMIN-NEXT:    add a6, a7, a6
+; ZVFHMIN-NEXT:    mv a7, a6
+; ZVFHMIN-NEXT:    slli a6, a6, 1
+; ZVFHMIN-NEXT:    add a7, a7, a6
+; ZVFHMIN-NEXT:    slli a6, a6, 2
+; ZVFHMIN-NEXT:    add a6, a6, a7
 ; ZVFHMIN-NEXT:    add a6, sp, a6
 ; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vl8r.v v16, (a6) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    sltu a6, a2, a5
 ; ZVFHMIN-NEXT:    addi a6, a6, -1
 ; ZVFHMIN-NEXT:    and a5, a6, a5
-; ZVFHMIN-NEXT:    csrr a6, vlenb
-; ZVFHMIN-NEXT:    slli a7, a6, 4
-; ZVFHMIN-NEXT:    add a6, a7, a6
-; ZVFHMIN-NEXT:    add a6, sp, a6
-; ZVFHMIN-NEXT:    addi a6, a6, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a6) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a5, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vmfeq.vv v4, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vmfeq.vv v5, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v5, v6, a3
+; ZVFHMIN-NEXT:    vslideup.vx v6, v4, a3
 ; ZVFHMIN-NEXT:    bltu a2, a4, .LBB171_6
 ; ZVFHMIN-NEXT:  # %bb.5:
 ; ZVFHMIN-NEXT:    mv a2, a4
 ; ZVFHMIN-NEXT:  .LBB171_6:
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v7
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a5, a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a5, a4, 5
-; ZVFHMIN-NEXT:    add a4, a5, a4
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a5, a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 2
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
@@ -3935,16 +3931,18 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v4, a3
+; ZVFHMIN-NEXT:    vslideup.vx v8, v5, a3
 ; ZVFHMIN-NEXT:    add a0, a1, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; ZVFHMIN-NEXT:    vslideup.vx v8, v5, a1
+; ZVFHMIN-NEXT:    vslideup.vx v8, v6, a1
 ; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a1, a1, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a1, a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index e8099c2d08a5f8..d06281eb62a78d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -1089,14 +1089,11 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    vsetvli a4, zero, e8, m8, ta, ma
@@ -1104,33 +1101,26 @@ define <vscale x 128 x i1> @icmp_eq_vv_nxv128i8(<vscale x 128 x i8> %va, <vscale
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    sub a4, a3, a1
-; CHECK-NEXT:    vl8r.v v8, (a2)
+; CHECK-NEXT:    vl8r.v v24, (a2)
 ; CHECK-NEXT:    sltu a2, a3, a4
-; CHECK-NEXT:    vl8r.v v24, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v8, (a0)
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a4
 ; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v6, v16, v8, v0.t
+; CHECK-NEXT:    vmseq.vv v6, v16, v24, v0.t
 ; CHECK-NEXT:    bltu a3, a1, .LBB96_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a3, a1
 ; CHECK-NEXT:  .LBB96_2:
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a3, e8, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v16, v8, v24, v0.t
+; CHECK-NEXT:    vmseq.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    vmv1r.v v8, v6
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -2241,14 +2231,11 @@ define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vmv1r.v v7, v0
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    srli a1, a3, 2
@@ -2256,37 +2243,30 @@ define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    slli a3, a3, 1
 ; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    sub a5, a2, a3
-; CHECK-NEXT:    vl8re32.v v8, (a4)
+; CHECK-NEXT:    vl8re32.v v24, (a4)
 ; CHECK-NEXT:    sltu a4, a2, a5
 ; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    vl8re32.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re32.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a1
+; CHECK-NEXT:    vslidedown.vx v0, v0, a1
 ; CHECK-NEXT:    and a4, a4, a5
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v7, v16, v8, v0.t
+; CHECK-NEXT:    vmseq.vv v6, v16, v24, v0.t
 ; CHECK-NEXT:    bltu a2, a3, .LBB189_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a3
 ; CHECK-NEXT:  .LBB189_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vmseq.vv v16, v8, v24, v0.t
+; CHECK-NEXT:    vmseq.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    add a0, a1, a1
 ; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v16, v7, a1
+; CHECK-NEXT:    vslideup.vx v16, v6, a1
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
index 98ec99bcfea33e..a9f10f4ff424fe 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpstore.ll
@@ -609,15 +609,7 @@ declare void @llvm.experimental.vp.strided.store.nxv16f64.p0.i32(<vscale x 16 x
 define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 signext %stride, <vscale x 17 x i1> %mask, i32 zeroext %evl) {
 ; CHECK-LABEL: strided_store_nxv17f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    sub sp, sp, a4
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    addi a4, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a4, vlenb
 ; CHECK-NEXT:    slli a6, a4, 1
 ; CHECK-NEXT:    mv a5, a3
@@ -630,8 +622,8 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a7, a4
 ; CHECK-NEXT:  .LBB48_4:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vl8re64.v v16, (a0)
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    vsetvli zero, a7, e64, m8, ta, ma
 ; CHECK-NEXT:    vsse64.v v8, (a1), a2, v0.t
 ; CHECK-NEXT:    sub a0, a5, a4
@@ -639,7 +631,7 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    srli t0, a4, 3
 ; CHECK-NEXT:    sub a6, a3, a6
 ; CHECK-NEXT:    vsetvli t1, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, t0
+; CHECK-NEXT:    vslidedown.vx v0, v7, t0
 ; CHECK-NEXT:    sltu t0, a5, a0
 ; CHECK-NEXT:    add a7, a1, a7
 ; CHECK-NEXT:    sltu a3, a3, a6
@@ -647,10 +639,8 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    addi a3, a3, -1
 ; CHECK-NEXT:    and t0, t0, a0
 ; CHECK-NEXT:    and a0, a3, a6
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, t0, e64, m8, ta, ma
-; CHECK-NEXT:    vsse64.v v8, (a7), a2, v0.t
+; CHECK-NEXT:    vsse64.v v16, (a7), a2, v0.t
 ; CHECK-NEXT:    bltu a0, a4, .LBB48_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a0, a4
@@ -658,16 +648,10 @@ define void @strided_store_nxv17f64(<vscale x 17 x double> %v, ptr %ptr, i32 sig
 ; CHECK-NEXT:    mul a3, a5, a2
 ; CHECK-NEXT:    srli a4, a4, 2
 ; CHECK-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a4
+; CHECK-NEXT:    vslidedown.vx v0, v7, a4
 ; CHECK-NEXT:    add a1, a1, a3
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vsse64.v v16, (a1), a2, v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vsse64.v v24, (a1), a2, v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.experimental.vp.strided.store.nxv17f64.p0.i32(<vscale x 17 x double> %v, ptr %ptr, i32 %stride, <vscale x 17 x i1> %mask, i32 %evl)
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index 4338d1f61af728..5da63c070921bc 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -106,56 +106,43 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    li a2, 24
-; CHECK-NEXT:    mul a1, a1, a2
+; CHECK-NEXT:    slli a1, a1, 4
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    vl8re64.v v16, (a0)
+; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vid.v v8
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    vand.vi v8, v8, 1
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vmseq.vi v24, v8, 0
-; CHECK-NEXT:    vl8re64.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmnot.m v6, v24
-; CHECK-NEXT:    vcompress.vm v8, v16, v24
-; CHECK-NEXT:    vmv1r.v v13, v24
-; CHECK-NEXT:    vcompress.vm v24, v16, v6
-; CHECK-NEXT:    vmv1r.v v12, v6
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vcompress.vm v0, v16, v13
+; CHECK-NEXT:    vmseq.vi v16, v8, 0
+; CHECK-NEXT:    vl8re64.v v0, (a0)
+; CHECK-NEXT:    vmnot.m v17, v16
+; CHECK-NEXT:    vcompress.vm v8, v24, v16
+; CHECK-NEXT:    vmv1r.v v12, v16
+; CHECK-NEXT:    vmv1r.v v13, v17
+; CHECK-NEXT:    vcompress.vm v16, v24, v13
+; CHECK-NEXT:    vcompress.vm v24, v0, v12
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vcompress.vm v0, v16, v12
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vcompress.vm v24, v0, v13
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v12, v16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv4r.v v12, v24
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v28, v16
-; CHECK-NEXT:    vmv8r.v v16, v24
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv4r.v v20, v24
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index 99743066c79a82..4933979b7ec4c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -182,50 +182,39 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vand.vi v24, v16, 1
-; CHECK-NEXT:    vmseq.vi v16, v24, 0
-; CHECK-NEXT:    vcompress.vm v24, v8, v16
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vand.vi v8, v8, 1
+; CHECK-NEXT:    vmseq.vi v7, v8, 0
+; CHECK-NEXT:    vcompress.vm v8, v24, v7
+; CHECK-NEXT:    vmnot.m v12, v7
+; CHECK-NEXT:    vmv1r.v v13, v7
+; CHECK-NEXT:    vcompress.vm v0, v24, v12
+; CHECK-NEXT:    vcompress.vm v24, v16, v13
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmnot.m v17, v16
-; CHECK-NEXT:    vcompress.vm v0, v8, v17
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vcompress.vm v24, v8, v16
+; CHECK-NEXT:    vcompress.vm v24, v16, v12
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vcompress.vm v24, v8, v17
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv4r.v v12, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v20, v8
-; CHECK-NEXT:    vmv4r.v v4, v24
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v4, v16
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -423,50 +412,39 @@ define {<vscale x 8 x double>, <vscale x 8 x double>} @vector_deinterleave_nxv8f
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
-; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT:    vmv8r.v v24, v8
 ; CHECK-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vid.v v16
-; CHECK-NEXT:    vand.vi v24, v16, 1
-; CHECK-NEXT:    vmseq.vi v16, v24, 0
-; CHECK-NEXT:    vcompress.vm v24, v8, v16
+; CHECK-NEXT:    vid.v v8
+; CHECK-NEXT:    vand.vi v8, v8, 1
+; CHECK-NEXT:    vmseq.vi v7, v8, 0
+; CHECK-NEXT:    vcompress.vm v8, v24, v7
+; CHECK-NEXT:    vmnot.m v12, v7
+; CHECK-NEXT:    vmv1r.v v13, v7
+; CHECK-NEXT:    vcompress.vm v0, v24, v12
+; CHECK-NEXT:    vcompress.vm v24, v16, v13
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmnot.m v17, v16
-; CHECK-NEXT:    vcompress.vm v0, v8, v17
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vcompress.vm v24, v8, v16
+; CHECK-NEXT:    vcompress.vm v24, v16, v12
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vcompress.vm v24, v8, v17
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv4r.v v12, v16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vmv4r.v v20, v8
-; CHECK-NEXT:    vmv4r.v v4, v24
-; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    vmv4r.v v4, v16
 ; CHECK-NEXT:    vmv8r.v v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
index 1953cfd2a0169f..676f405ec82710 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfadd-vp.ll
@@ -514,75 +514,57 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 4
+; CHECK-NEXT:    slli a2, a1, 3
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a3, a1, 3
-; CHECK-NEXT:    add a1, a3, a1
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv.v.x v0, a1
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a4, sp, 16
+; CHECK-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv4r.v v24, v0
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a4, a3, 3
-; CHECK-NEXT:    add a3, a4, a3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v4
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB24_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB24_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 3
-; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    slli a1, a0, 3
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -601,55 +583,41 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v7
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv.v.x v0, a1
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    vslidedown.vx v12, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv4r.v v16, v0
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vfadd.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vfadd.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB25_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB25_2:
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -657,7 +625,7 @@ define <vscale x 32 x bfloat> @vfadd_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -1320,75 +1288,57 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    slli a2, a1, 3
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v24, v0
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a3, a1, 3
-; ZVFHMIN-NEXT:    add a1, a3, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v0, a1
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a4, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv4r.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB50_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB50_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfadd.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    slli a1, a0, 3
 ; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
@@ -1413,55 +1363,41 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v0, a1
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv4r.v v16, v0
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfadd.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfadd.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB51_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB51_2:
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -1469,7 +1405,7 @@ define <vscale x 32 x half> @vfadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
index ccd286b7ee5fd3..740fc453a4f96c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-vp.ll
@@ -476,75 +476,57 @@ define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 4
+; CHECK-NEXT:    slli a2, a1, 3
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a3, a1, 3
-; CHECK-NEXT:    add a1, a3, a1
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv.v.x v0, a1
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a4, sp, 16
+; CHECK-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv4r.v v24, v0
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a4, a3, 3
-; CHECK-NEXT:    add a3, a4, a3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v4
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB22_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB22_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 3
-; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    slli a1, a0, 3
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -563,55 +545,41 @@ define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v7
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv.v.x v0, a1
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    vslidedown.vx v12, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv4r.v v16, v0
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vfdiv.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vfdiv.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB23_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB23_2:
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -619,7 +587,7 @@ define <vscale x 32 x bfloat> @vfdiv_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -1232,75 +1200,57 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    slli a2, a1, 3
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v24, v0
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a3, a1, 3
-; ZVFHMIN-NEXT:    add a1, a3, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v0, a1
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a4, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv4r.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB46_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB46_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfdiv.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    slli a1, a0, 3
 ; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
@@ -1325,55 +1275,41 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v0, a1
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv4r.v v16, v0
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfdiv.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfdiv.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB47_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB47_2:
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -1381,7 +1317,7 @@ define <vscale x 32 x half> @vfdiv_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index fd518d9be786de..7d3eefd3a29d52 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -473,29 +473,16 @@ declare <vscale x 16 x bfloat> @llvm.vp.fma.nxv16bf16(<vscale x 16 x bfloat>, <v
 define <vscale x 16 x bfloat> @vfma_vv_nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x bfloat> %c, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfma_vv_nxv16bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vmv4r.v v4, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; CHECK-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x bfloat> @llvm.vp.fma.nxv16bf16(<vscale x 16 x bfloat> %va, <vscale x 16 x bfloat> %b, <vscale x 16 x bfloat> %c, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x bfloat> %v
@@ -520,32 +507,18 @@ define <vscale x 16 x bfloat> @vfma_vv_nxv16bf16_unmasked(<vscale x 16 x bfloat>
 define <vscale x 16 x bfloat> @vfma_vf_nxv16bf16(<vscale x 16 x bfloat> %va, bfloat %b, <vscale x 16 x bfloat> %vc, <vscale x 16 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vfma_vf_nxv16bf16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    vmv4r.v v16, v12
+; CHECK-NEXT:    vmv4r.v v20, v8
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv.v.x v12, a1
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
+; CHECK-NEXT:    vmv.v.x v4, a1
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x bfloat> poison, bfloat %b, i32 0
   %vb = shufflevector <vscale x 16 x bfloat> %elt.head, <vscale x 16 x bfloat> poison, <vscale x 16 x i32> zeroinitializer
@@ -628,19 +601,27 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    add a2, a2, a3
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    vl8re16.v v0, (a0)
+; CHECK-NEXT:    vl8re16.v v24, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a2, a0, 5
+; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a0, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a1, a0
 ; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 4
+; CHECK-NEXT:    slli a4, a4, 3
+; CHECK-NEXT:    mv a5, a4
+; CHECK-NEXT:    slli a4, a4, 1
+; CHECK-NEXT:    add a4, a4, a5
 ; CHECK-NEXT:    add a4, sp, a4
 ; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v24, v24, a2
+; CHECK-NEXT:    vslidedown.vx v24, v0, a2
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    slli a2, a2, 3
 ; CHECK-NEXT:    add a2, sp, a2
@@ -650,41 +631,35 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    mv a4, a3
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a4, a4, a3
-; CHECK-NEXT:    slli a3, a3, 1
-; CHECK-NEXT:    add a3, a3, a4
+; CHECK-NEXT:    slli a3, a3, 4
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv4r.v v8, v16
-; CHECK-NEXT:    vmv8r.v v24, v16
+; CHECK-NEXT:    vmv8r.v v0, v16
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a4, a3, 5
-; CHECK-NEXT:    add a3, a4, a3
+; CHECK-NEXT:    mv a4, a3
+; CHECK-NEXT:    slli a3, a3, 3
+; CHECK-NEXT:    add a4, a4, a3
+; CHECK-NEXT:    slli a3, a3, 1
+; CHECK-NEXT:    add a3, a3, a4
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a4, a3, 4
+; CHECK-NEXT:    slli a4, a3, 5
 ; CHECK-NEXT:    add a3, a4, a3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v4
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a3, a3, 3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
 ; CHECK-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
@@ -696,11 +671,7 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    mv a1, a0
 ; CHECK-NEXT:  .LBB30_2:
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    mv a2, a0
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a2, a2, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -711,27 +682,36 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a2, a0, 5
-; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a2, a2, a0
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a2, a0, 4
+; CHECK-NEXT:    slli a2, a0, 5
 ; CHECK-NEXT:    add a0, a2, a0
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a2, a0, 5
-; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a2, a2, a0
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
@@ -741,8 +721,11 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <vs
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a2, a0, 5
-; CHECK-NEXT:    add a0, a2, a0
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a2, a2, a0
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -777,7 +760,8 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    slli a2, a2, 5
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    vl8re16.v v24, (a0)
+; CHECK-NEXT:    vmv8r.v v0, v16
+; CHECK-NEXT:    vl8re16.v v16, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    mv a2, a0
@@ -785,15 +769,17 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    add a0, a0, a2
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v7
+; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    slli a0, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a1, a0
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    vslidedown.vx v16, v16, a2
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vs1r.v v16, (a2) # Unknown-size Folded Spill
 ; CHECK-NEXT:    sltu a2, a1, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
@@ -804,16 +790,12 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv4r.v v8, v16
-; CHECK-NEXT:    vmv8r.v v24, v16
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a3, a3, 4
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a3, a3, 3
 ; CHECK-NEXT:    mv a4, a3
@@ -821,10 +803,10 @@ define <vscale x 32 x bfloat> @vfma_vv_nxv32bf16_unmasked(<vscale x 32 x bfloat>
 ; CHECK-NEXT:    add a3, a3, a4
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
+; CHECK-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v4
 ; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
@@ -2000,29 +1982,16 @@ define <vscale x 16 x half> @vfma_vv_nxv16f16(<vscale x 16 x half> %va, <vscale
 ;
 ; ZVFHMIN-LABEL: vfma_vv_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    vmv4r.v v4, v12
+; ZVFHMIN-NEXT:    vmv4r.v v20, v8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %v = call <vscale x 16 x half> @llvm.vp.fma.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %b, <vscale x 16 x half> %c, <vscale x 16 x i1> %m, i32 %evl)
   ret <vscale x 16 x half> %v
@@ -2059,32 +2028,18 @@ define <vscale x 16 x half> @vfma_vf_nxv16f16(<vscale x 16 x half> %va, half %b,
 ;
 ; ZVFHMIN-LABEL: vfma_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    vmv4r.v v16, v12
+; ZVFHMIN-NEXT:    vmv4r.v v20, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    addi a2, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv.v.x v12, a1
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vmv.v.x v4, a1
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -2193,19 +2148,27 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
-; ZVFHMIN-NEXT:    vmv1r.v v24, v0
-; ZVFHMIN-NEXT:    vl8re16.v v0, (a0)
+; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a2, a0, 5
+; ZVFHMIN-NEXT:    add a0, a2, a0
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a1, a0
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v24, v24, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v24, v0, a2
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -2215,41 +2178,35 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    mv a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a4, a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 1
-; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv4r.v v8, v16
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 5
-; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a4, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 4
+; ZVFHMIN-NEXT:    slli a4, a3, 5
 ; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
@@ -2261,11 +2218,7 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB66_2:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a2, a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -2276,27 +2229,36 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a2, a0, 5
-; ZVFHMIN-NEXT:    add a0, a2, a0
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a2, a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a2, a0, 4
+; ZVFHMIN-NEXT:    slli a2, a0, 5
 ; ZVFHMIN-NEXT:    add a0, a2, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a2, a0, 5
-; ZVFHMIN-NEXT:    add a0, a2, a0
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a2, a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a0) # Unknown-size Folded Reload
@@ -2306,8 +2268,11 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16(<vscale x 32 x half> %va, <vscale
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a2, a0, 5
-; ZVFHMIN-NEXT:    add a0, a2, a0
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a2, a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -2349,7 +2314,8 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    vl8re16.v v16, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a2, a0
@@ -2357,15 +2323,17 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    slli a0, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a2, a1, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
@@ -2376,16 +2344,12 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv4r.v v8, v16
-; ZVFHMIN-NEXT:    vmv8r.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    mv a4, a3
@@ -2393,10 +2357,10 @@ define <vscale x 32 x half> @vfma_vv_nxv32f16_unmasked(<vscale x 32 x half> %va,
 ; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vl8r.v v0, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
@@ -7825,33 +7789,19 @@ define <vscale x 16 x half> @vfnmadd_vv_nxv16f16_commuted(<vscale x 16 x half> %
 ;
 ; ZVFHMIN-LABEL: vfnmadd_vv_nxv16f16_commuted:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    vmv4r.v v4, v8
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v20, v12, a1, v0.t
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %negb = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> %b, <vscale x 16 x i1> %m, i32 %evl)
   %negc = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> %c, <vscale x 16 x i1> %m, i32 %evl)
@@ -7924,36 +7874,21 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16(<vscale x 16 x half> %va, half
 ;
 ; ZVFHMIN-LABEL: vfnmadd_vf_nxv16f16:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v4, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v8, v8, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v16, v8, a1, v0.t
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -8152,22 +8087,37 @@ define <vscale x 16 x half> @vfnmadd_vf_nxv16f16_neg_splat_commute(<vscale x 16
 ;
 ; ZVFHMIN-LABEL: vfnmadd_vf_nxv16f16_neg_splat_commute:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    vmv4r.v v4, v8
+; ZVFHMIN-NEXT:    addi sp, sp, -16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 2
+; ZVFHMIN-NEXT:    sub sp, sp, a1
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vs4r.v v8, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmv.v.x v16, a1
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v4, v16, a1, v0.t
 ; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    vl4r.v v4, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add sp, sp, a0
+; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
+; ZVFHMIN-NEXT:    addi sp, sp, 16
+; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %elt.head = insertelement <vscale x 16 x half> poison, half %b, i32 0
   %vb = shufflevector <vscale x 16 x half> %elt.head, <vscale x 16 x half> poison, <vscale x 16 x i32> zeroinitializer
@@ -8282,33 +8232,19 @@ define <vscale x 16 x half> @vfnmsub_vv_nxv16f16_commuted(<vscale x 16 x half> %
 ;
 ; ZVFHMIN-LABEL: vfnmsub_vv_nxv16f16_commuted:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT:    vmv4r.v v4, v8
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v12, v12, a1, v0.t
+; ZVFHMIN-NEXT:    vxor.vx v20, v12, a1, v0.t
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a1, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %negb = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> %b, <vscale x 16 x i1> %m, i32 %evl)
   %negc = call <vscale x 16 x half> @llvm.vp.fneg.nxv16f16(<vscale x 16 x half> %c, <vscale x 16 x i1> %m, i32 %evl)
@@ -8851,17 +8787,10 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 2
-; ZVFHMIN-NEXT:    add a2, a2, a3
+; ZVFHMIN-NEXT:    slli a3, a2, 5
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
@@ -8871,7 +8800,12 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v20, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v7, v7, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v7, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a3, a1, a4
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v0, v24, a2
@@ -8880,43 +8814,41 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    mv a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
-; ZVFHMIN-NEXT:    add a2, a2, a4
+; ZVFHMIN-NEXT:    slli a4, a2, 4
+; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a4, a2, 3
+; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a4, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv1r.v v0, v20
+; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a4, a2, 3
+; ZVFHMIN-NEXT:    add a2, a4, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
@@ -8925,7 +8857,8 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a3, a2, 3
+; ZVFHMIN-NEXT:    add a2, a3, a2
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -8934,22 +8867,25 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB281_2:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
+; ZVFHMIN-NEXT:    slli a2, a0, 4
+; ZVFHMIN-NEXT:    add a0, a2, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a2, a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a2, a0, 3
+; ZVFHMIN-NEXT:    add a0, a2, a0
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
@@ -8959,10 +8895,8 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, a1
+; ZVFHMIN-NEXT:    slli a1, a0, 5
+; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -9020,21 +8954,23 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a3, a2
 ; ZVFHMIN-NEXT:  .LBB282_2:
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
-; ZVFHMIN-NEXT:    addi a4, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
@@ -9042,19 +8978,16 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
@@ -9067,11 +9000,14 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
 ; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
@@ -9095,16 +9031,13 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16(<vscale x 32 x half> %va, half %
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
@@ -9158,11 +9091,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a3, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
 ; ZVFHMIN-NEXT:    mv a3, a0
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v20
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -9175,7 +9104,11 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    mv a3, a2
 ; ZVFHMIN-NEXT:  .LBB283_2:
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
-; ZVFHMIN-NEXT:    vmv4r.v v4, v12
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
@@ -9194,13 +9127,8 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
@@ -9208,7 +9136,12 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_commute(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
 ; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    mv a4, a3
@@ -9303,12 +9236,12 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v16, v7, a3
 ; ZVFHMIN-NEXT:    sltu a3, a0, a4
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a2
+; ZVFHMIN-NEXT:    vmv.v.x v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
@@ -9316,7 +9249,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv4r.v v8, v24
+; ZVFHMIN-NEXT:    vmv4r.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a4, a2
@@ -9325,7 +9258,8 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -9406,12 +9340,12 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
+; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
 ; ZVFHMIN-NEXT:    lui a1, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
@@ -9420,7 +9354,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    slli a4, a4, 2
 ; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
@@ -9435,17 +9369,20 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v7, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a5, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a5
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a3, a0, a4
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v8, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
@@ -9453,18 +9390,27 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a4
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
@@ -9482,7 +9428,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
@@ -9493,7 +9442,7 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
@@ -9506,7 +9455,10 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v24
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
@@ -9515,9 +9467,9 @@ define <vscale x 32 x half> @vfmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
@@ -9819,124 +9771,115 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a3, a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a2, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a2, a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
+; ZVFHMIN-NEXT:    mv a3, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vs1r.v v7, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v0, v16, a2
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a4, a0, 5
-; ZVFHMIN-NEXT:    add a0, a4, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a4, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a0, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
+; ZVFHMIN-NEXT:    csrr a5, vlenb
+; ZVFHMIN-NEXT:    slli a5, a5, 3
+; ZVFHMIN-NEXT:    mv a6, a5
+; ZVFHMIN-NEXT:    slli a5, a5, 2
+; ZVFHMIN-NEXT:    add a5, a5, a6
+; ZVFHMIN-NEXT:    add a5, sp, a5
+; ZVFHMIN-NEXT:    addi a5, a5, 16
+; ZVFHMIN-NEXT:    vl1r.v v16, (a5) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a3
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 5
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs1r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a3, a1, a4
-; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    mv a6, a5
-; ZVFHMIN-NEXT:    slli a5, a5, 3
-; ZVFHMIN-NEXT:    add a6, a6, a5
-; ZVFHMIN-NEXT:    slli a5, a5, 1
-; ZVFHMIN-NEXT:    add a5, a5, a6
-; ZVFHMIN-NEXT:    add a5, sp, a5
-; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v0, v16, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vxor.vx v16, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    mv a4, a2
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a4, a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 2
 ; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 4
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 5
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 4
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 5
-; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 5
-; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
@@ -9945,18 +9888,16 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB288_2:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a2, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a2, a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
 ; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a2, a0, 5
-; ZVFHMIN-NEXT:    add a0, a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -9965,10 +9906,9 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a1, a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
@@ -9994,109 +9934,101 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a3, a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v0, v0, a2
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a4, a0, 5
-; ZVFHMIN-NEXT:    add a0, a4, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a4, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a0, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v8, v8, a3
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 5
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs1r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a3, a1, a4
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v0, v24, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    mv a4, a2
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a4, a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 2
 ; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 5
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 4
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 4
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 5
-; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 4
-; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    bltu a1, a0, .LBB289_2
@@ -10104,18 +10036,16 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB289_2:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a2, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a2, a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
 ; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a2, a0, 4
-; ZVFHMIN-NEXT:    add a0, a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
@@ -10125,10 +10055,9 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v8
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a1, a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
@@ -10174,8 +10103,7 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a4, v0.t
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a4, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
-; ZVFHMIN-NEXT:    addi a4, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v20
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
@@ -10186,7 +10114,8 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv4r.v v4, v12
+; ZVFHMIN-NEXT:    addi a4, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 4
@@ -10223,10 +10152,8 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    mv a4, a3
@@ -10235,7 +10162,9 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
@@ -10456,17 +10385,9 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a4, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v0, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
@@ -10481,17 +10402,17 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v24, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a3, a0, a2
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a2, a3, a2
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
@@ -10500,13 +10421,13 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
@@ -10581,17 +10502,17 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v24, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a4, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vmv.v.x v0, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a2
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
@@ -10599,24 +10520,32 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v24, v16, a2
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl1r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
-; ZVFHMIN-NEXT:    sltu a3, a0, a2
-; ZVFHMIN-NEXT:    addi a3, a3, -1
-; ZVFHMIN-NEXT:    and a2, a3, a2
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v16, a3
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs1r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sltu a3, a0, a2
+; ZVFHMIN-NEXT:    addi a3, a3, -1
+; ZVFHMIN-NEXT:    and a2, a3, a2
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
@@ -10625,13 +10554,13 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    addi a2, sp, 16
@@ -11000,28 +10929,39 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v24
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    vmv.v.x v0, a1
+; ZVFHMIN-NEXT:    slli a4, a3, 5
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a4, a1, 5
-; ZVFHMIN-NEXT:    add a1, a4, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v0, v0, a2
+; ZVFHMIN-NEXT:    vxor.vx v0, v24, a2
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a5, a4, 5
+; ZVFHMIN-NEXT:    add a4, a5, a4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl1r.v v24, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v24, v24, a3
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs1r.v v24, (a3) # Unknown-size Folded Spill
@@ -11029,64 +10969,69 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a2, a3, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    mv a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a4, a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 1
-; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    slli a4, a3, 5
+; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 4
-; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a4, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 5
-; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 4
-; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a4, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 5
-; ZVFHMIN-NEXT:    add a2, a3, a2
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 4
-; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a3, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
@@ -11096,18 +11041,18 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB296_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a2, a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    slli a2, a1, 5
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 4
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a2, a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -11147,15 +11092,13 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a2, a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
+; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
@@ -11163,26 +11106,25 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v0, v0, a2
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a4, a1, 5
-; ZVFHMIN-NEXT:    add a1, a4, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a4, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
-; ZVFHMIN-NEXT:    vxor.vx v0, v24, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v8, v8, a3
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 5
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs1r.v v8, (a3) # Unknown-size Folded Spill
@@ -11190,66 +11132,61 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a2, a3, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    mv a4, a3
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a4, a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 2
 ; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v20
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 5
-; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 4
-; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 5
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 4
-; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 5
-; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 4
-; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB297_2
@@ -11257,18 +11194,16 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB297_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    mv a2, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a2, a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 4
-; ZVFHMIN-NEXT:    add a1, a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -11278,10 +11213,9 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v8
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a1, a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
@@ -11584,124 +11518,115 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a3, a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
+; ZVFHMIN-NEXT:    lui a2, 8
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT:    vmset.m v7
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a2, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a2, a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a2
+; ZVFHMIN-NEXT:    mv a3, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    add a0, a0, a3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    lui a2, 8
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vs1r.v v7, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
+; ZVFHMIN-NEXT:    vxor.vx v0, v16, a2
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a4, a0, 5
-; ZVFHMIN-NEXT:    add a0, a4, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a4, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a0, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
-; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v16, v7, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs1r.v v16, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    sltu a3, a1, a4
 ; ZVFHMIN-NEXT:    csrr a5, vlenb
-; ZVFHMIN-NEXT:    mv a6, a5
 ; ZVFHMIN-NEXT:    slli a5, a5, 3
-; ZVFHMIN-NEXT:    add a6, a6, a5
-; ZVFHMIN-NEXT:    slli a5, a5, 1
+; ZVFHMIN-NEXT:    mv a6, a5
+; ZVFHMIN-NEXT:    slli a5, a5, 2
 ; ZVFHMIN-NEXT:    add a5, a5, a6
 ; ZVFHMIN-NEXT:    add a5, sp, a5
 ; ZVFHMIN-NEXT:    addi a5, a5, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a5) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl1r.v v16, (a5) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT:    vslidedown.vx v16, v16, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sltu a3, a1, a4
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v0, v16, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vxor.vx v16, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    mv a4, a2
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a4, a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 2
 ; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 4
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 5
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 4
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    add a2, sp, a2
+; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 5
-; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v24
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 5
-; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
@@ -11710,18 +11635,16 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB300_2:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a2, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a2, a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
 ; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a2, a0, 5
-; ZVFHMIN-NEXT:    add a0, a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -11730,10 +11653,9 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a1, a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
@@ -11759,109 +11681,101 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    mv a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a3, a3, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
 ; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    sub sp, sp, a2
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e8, m4, ta, ma
 ; ZVFHMIN-NEXT:    vmset.m v8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v0, v0, a2
+; ZVFHMIN-NEXT:    vxor.vx v16, v16, a2
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a4, a0, 5
-; ZVFHMIN-NEXT:    add a0, a4, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    mv a4, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    add a0, a0, a4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a0, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a1, a0
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v8, v8, a3
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 5
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs1r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a3, a1, a4
 ; ZVFHMIN-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v0, v24, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    mv a4, a2
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
-; ZVFHMIN-NEXT:    add a4, a4, a2
-; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    mv a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 2
 ; ZVFHMIN-NEXT:    add a2, a2, a4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 5
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 4
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 4
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a4, a2, 4
-; ZVFHMIN-NEXT:    add a2, a4, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 5
-; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a2) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v8, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a3, a2, 4
-; ZVFHMIN-NEXT:    add a2, a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    vl8r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    slli a2, a2, 5
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfncvt.f.f.w v28, v8
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    bltu a1, a0, .LBB301_2
@@ -11869,18 +11783,16 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    mv a1, a0
 ; ZVFHMIN-NEXT:  .LBB301_2:
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    mv a2, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a2, a2, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
+; ZVFHMIN-NEXT:    mv a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 2
 ; ZVFHMIN-NEXT:    add a0, a0, a2
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a2, a0, 4
-; ZVFHMIN-NEXT:    add a0, a2, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
@@ -11890,10 +11802,9 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16_unmasked_commuted(<vscale x 32
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v24, v8
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add a1, a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
@@ -11918,12 +11829,12 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
+; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
 ; ZVFHMIN-NEXT:    vmv8r.v v24, v16
 ; ZVFHMIN-NEXT:    fmv.x.h a2, fa0
 ; ZVFHMIN-NEXT:    lui a3, 8
@@ -11947,7 +11858,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    slli a4, a4, 2
 ; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
@@ -11962,17 +11873,25 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv4r.v v4, v28
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT:    vmv4r.v v4, v12
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    slli a4, a4, 2
 ; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
@@ -11996,25 +11915,28 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    mv a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    slli a3, a3, 2
 ; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
 ; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
@@ -12023,7 +11945,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
@@ -12044,9 +11966,9 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16(<vscale x 32 x half> %va, half
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
@@ -12093,11 +12015,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a3, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
 ; ZVFHMIN-NEXT:    mv a3, a0
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v12
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    bltu a0, a2, .LBB303_2
@@ -12110,7 +12028,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv4r.v v4, v28
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
 ; ZVFHMIN-NEXT:    mv a5, a4
@@ -12129,21 +12051,21 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    sub a2, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    mv a4, a3
@@ -12153,26 +12075,16 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
@@ -12237,12 +12149,12 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v16, v7, a3
 ; ZVFHMIN-NEXT:    sltu a3, a0, a4
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a2
+; ZVFHMIN-NEXT:    vmv.v.x v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
@@ -12250,7 +12162,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv4r.v v8, v24
+; ZVFHMIN-NEXT:    vmv4r.v v8, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a4, a2
@@ -12259,7 +12171,8 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -12364,22 +12277,22 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v16, v8, a1
+; ZVFHMIN-NEXT:    vxor.vx v24, v8, a1
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    sub a4, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a5, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v20, v7, a3
 ; ZVFHMIN-NEXT:    sltu a3, a0, a4
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a3, a3, a4
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v24, a2
+; ZVFHMIN-NEXT:    vmv.v.x v0, a2
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
-; ZVFHMIN-NEXT:    vmv4r.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
+; ZVFHMIN-NEXT:    vmv4r.v v16, v0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    mv a4, a2
@@ -12388,12 +12301,13 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_unmasked_commute(<vscale x 32 x
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv1r.v v0, v20
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 4
 ; ZVFHMIN-NEXT:    add a2, sp, a2
@@ -12481,12 +12395,15 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
@@ -12499,15 +12416,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    vxor.vx v24, v16, a3, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
 ; ZVFHMIN-NEXT:    mv a3, a0
-; ZVFHMIN-NEXT:    vmv4r.v v20, v28
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a4, a4, a5
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v28
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
 ; ZVFHMIN-NEXT:    bltu a0, a2, .LBB306_2
@@ -12515,56 +12424,54 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    mv a3, a2
 ; ZVFHMIN-NEXT:  .LBB306_2:
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
+; ZVFHMIN-NEXT:    slli a4, a4, 5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv4r.v v4, v12
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    sub a2, a0, a2
+; ZVFHMIN-NEXT:    srli a1, a1, 2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    mv a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 1
-; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    slli a3, a3, 5
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
-; ZVFHMIN-NEXT:    sub a2, a0, a2
-; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    mv a4, a3
@@ -12572,7 +12479,8 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    sltu a0, a0, a2
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a1
@@ -12580,18 +12488,15 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
 ; ZVFHMIN-NEXT:    and a0, a0, a2
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v8, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -12627,14 +12532,14 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
@@ -12647,15 +12552,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    vxor.vx v16, v16, a3, v0.t
 ; ZVFHMIN-NEXT:    slli a2, a1, 1
 ; ZVFHMIN-NEXT:    mv a3, a0
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 4
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v20
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
@@ -12665,32 +12566,28 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:  .LBB307_2:
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
+; ZVFHMIN-NEXT:    mv a5, a4
+; ZVFHMIN-NEXT:    slli a4, a4, 1
+; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a4) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv4r.v v4, v20
 ; ZVFHMIN-NEXT:    addi a4, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 5
+; ZVFHMIN-NEXT:    slli a4, a4, 4
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a4, vlenb
 ; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    mv a5, a4
-; ZVFHMIN-NEXT:    slli a4, a4, 1
-; ZVFHMIN-NEXT:    add a4, a4, a5
 ; ZVFHMIN-NEXT:    add a4, sp, a4
 ; ZVFHMIN-NEXT:    addi a4, a4, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a4) # Unknown-size Folded Reload
@@ -12698,27 +12595,28 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    mv a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 1
-; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv4r.v v28, v4
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a2
 ; ZVFHMIN-NEXT:    srli a1, a1, 2
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
@@ -12728,7 +12626,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    addi a0, a0, -1
 ; ZVFHMIN-NEXT:    and a0, a0, a2
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -12736,9 +12634,6 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24, v0.t
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 1
-; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
@@ -12746,10 +12641,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
-; ZVFHMIN-NEXT:    add a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -12774,77 +12666,99 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a2, a2, a1
 ; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x29, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 41 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vmset.m v24
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 5
+; ZVFHMIN-NEXT:    add a3, a4, a3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vxor.vx v0, v24, a2
+; ZVFHMIN-NEXT:    addi a2, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
+; ZVFHMIN-NEXT:    csrr a4, vlenb
+; ZVFHMIN-NEXT:    slli a5, a4, 5
+; ZVFHMIN-NEXT:    add a4, a5, a4
+; ZVFHMIN-NEXT:    add a4, sp, a4
+; ZVFHMIN-NEXT:    addi a4, a4, 16
+; ZVFHMIN-NEXT:    vl1r.v v24, (a4) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v24, v24, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a3, a0, a2
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a2, a3, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    mv a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 1
-; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    slli a4, a3, 5
+; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    slli a4, a3, 4
+; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a4, a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
-; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a4, a3, 4
+; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB308_2
@@ -12852,27 +12766,31 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB308_2:
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
-; ZVFHMIN-NEXT:    add a1, a1, a2
+; ZVFHMIN-NEXT:    slli a2, a1, 5
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a2, a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 5
+; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
@@ -12882,8 +12800,9 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v0
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    mv a1, a0
+; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    add a1, a1, a0
 ; ZVFHMIN-NEXT:    slli a0, a0, 2
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
@@ -12910,82 +12829,96 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 2
+; ZVFHMIN-NEXT:    slli a1, a1, 1
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 48 * vlenb
+; ZVFHMIN-NEXT:    vmv8r.v v0, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    lui a2, 8
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
+; ZVFHMIN-NEXT:    vmset.m v8
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    vmv.v.x v24, a1
 ; ZVFHMIN-NEXT:    slli a1, a3, 1
 ; ZVFHMIN-NEXT:    srli a3, a3, 2
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v24, v24, a2
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sub a2, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a3
+; ZVFHMIN-NEXT:    vslidedown.vx v8, v8, a3
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs1r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    sltu a3, a0, a2
 ; ZVFHMIN-NEXT:    addi a3, a3, -1
 ; ZVFHMIN-NEXT:    and a2, a3, a2
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    mv a4, a3
-; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    slli a3, a3, 2
 ; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    csrr a3, vlenb
+; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    add a3, sp, a3
+; ZVFHMIN-NEXT:    addi a3, a3, 16
+; ZVFHMIN-NEXT:    vs8r.v v0, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v4
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 4
+; ZVFHMIN-NEXT:    slli a3, a3, 3
+; ZVFHMIN-NEXT:    mv a4, a3
+; ZVFHMIN-NEXT:    slli a3, a3, 1
+; ZVFHMIN-NEXT:    add a3, a3, a4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vl1r.v v0, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 5
+; ZVFHMIN-NEXT:    slli a3, a3, 4
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
 ; ZVFHMIN-NEXT:    slli a3, a3, 3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a3) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    addi a3, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v20, v8
-; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    slli a2, a2, 5
-; ZVFHMIN-NEXT:    add a2, sp, a2
-; ZVFHMIN-NEXT:    addi a2, a2, 16
+; ZVFHMIN-NEXT:    addi a2, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a2) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    slli a2, a2, 3
+; ZVFHMIN-NEXT:    mv a3, a2
+; ZVFHMIN-NEXT:    slli a2, a2, 1
+; ZVFHMIN-NEXT:    add a2, a2, a3
 ; ZVFHMIN-NEXT:    add a2, sp, a2
 ; ZVFHMIN-NEXT:    addi a2, a2, 16
 ; ZVFHMIN-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
@@ -12996,20 +12929,23 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    mv a2, a1
-; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    slli a1, a1, 2
 ; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 5
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    mv a2, a1
+; ZVFHMIN-NEXT:    slli a1, a1, 1
+; ZVFHMIN-NEXT:    add a1, a1, a2
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
@@ -13019,9 +12955,9 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute(<vsc
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v16, v24
 ; ZVFHMIN-NEXT:    vmv8r.v v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    mv a1, a0
-; ZVFHMIN-NEXT:    slli a0, a0, 2
+; ZVFHMIN-NEXT:    slli a0, a0, 1
 ; ZVFHMIN-NEXT:    add a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll
index 1d471ab2404b17..24ca0469f3073f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd-constrained-sdnode.ll
@@ -228,66 +228,69 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; CHECK-NEXT:    sub sp, sp, a1
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; CHECK-NEXT:    vmv8r.v v0, v16
-; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv8r.v v16, v8
-; CHECK-NEXT:    vl8re16.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; CHECK-NEXT:    vl8re16.v v16, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v0, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; CHECK-NEXT:    vfmadd.vv v0, v8, v24
+; CHECK-NEXT:    vfmadd.vv v0, v16, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 24
-; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v28
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 24
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; CHECK-NEXT:    vfmadd.vv v16, v8, v24
 ; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -665,66 +668,69 @@ define <vscale x 32 x half> @vfmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vl8re16.v v16, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v8, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll
index 88fd81a5a2f7bc..b78c4aad2fc4d5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmadd-sdnode.ll
@@ -227,17 +227,12 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    sub sp, sp, a1
 ; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; ZVFH-NEXT:    vmv8r.v v0, v16
-; ZVFH-NEXT:    addi a1, sp, 16
+; ZVFH-NEXT:    csrr a1, vlenb
+; ZVFH-NEXT:    slli a1, a1, 3
+; ZVFH-NEXT:    add a1, sp, a1
+; ZVFH-NEXT:    addi a1, a1, 16
 ; ZVFH-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vmv8r.v v16, v8
-; ZVFH-NEXT:    vl8re16.v v8, (a0)
-; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
-; ZVFH-NEXT:    add a0, sp, a0
-; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; ZVFH-NEXT:    vl8re16.v v16, (a0)
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    mv a1, a0
@@ -245,14 +240,17 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    add a0, a0, a1
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v0
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    slli a0, a0, 4
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
 ; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v0
+; ZVFH-NEXT:    addi a0, sp, 16
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    mv a1, a0
@@ -260,33 +258,30 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    add a0, a0, a1
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v0, v16
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    slli a0, a0, 4
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    addi a0, sp, 16
+; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfmadd.vv v0, v8, v24
+; ZVFH-NEXT:    vfmadd.vv v0, v16, v24
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 3
-; ZVFH-NEXT:    mv a1, a0
-; ZVFH-NEXT:    slli a0, a0, 1
-; ZVFH-NEXT:    add a0, a0, a1
+; ZVFH-NEXT:    slli a0, a0, 4
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
 ; ZVFH-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    addi a0, sp, 16
-; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
 ; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v28
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 3
 ; ZVFH-NEXT:    mv a1, a0
@@ -295,6 +290,12 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
 ; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFH-NEXT:    vfmadd.vv v16, v8, v24
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -317,66 +318,69 @@ define <vscale x 32 x bfloat> @vfmadd_vv_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vl8re16.v v16, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v0
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v8, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v28
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -399,22 +403,23 @@ define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    addi sp, sp, -16
 ; ZVFH-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 5
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 2
+; ZVFH-NEXT:    add a0, a0, a1
 ; ZVFH-NEXT:    sub sp, sp, a0
-; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFH-NEXT:    vmv8r.v v0, v16
+; ZVFH-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; ZVFH-NEXT:    addi a0, sp, 16
 ; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vmv8r.v v16, v8
 ; ZVFH-NEXT:    fmv.x.h a0, fa0
 ; ZVFH-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    slli a1, a1, 4
 ; ZVFH-NEXT:    add a1, sp, a1
 ; ZVFH-NEXT:    addi a1, a1, 16
 ; ZVFH-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v0
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v16
 ; ZVFH-NEXT:    csrr a1, vlenb
 ; ZVFH-NEXT:    slli a1, a1, 3
 ; ZVFH-NEXT:    add a1, sp, a1
@@ -439,7 +444,12 @@ define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    addi a0, a0, 16
 ; ZVFH-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v0
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v0
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 5
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 4
 ; ZVFH-NEXT:    add a0, sp, a0
@@ -450,16 +460,20 @@ define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
 ; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT:    csrr a0, vlenb
+; ZVFH-NEXT:    slli a0, a0, 5
+; ZVFH-NEXT:    add a0, sp, a0
+; ZVFH-NEXT:    addi a0, a0, 16
+; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfmadd.vv v8, v24, v0
-; ZVFH-NEXT:    vmv.v.v v24, v8
-; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; ZVFH-NEXT:    vfmadd.vv v16, v24, v0
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    slli a0, a0, 5
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFH-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; ZVFH-NEXT:    addi a0, sp, 16
 ; ZVFH-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v8, v20
@@ -472,18 +486,21 @@ define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFH-NEXT:    addi a0, a0, 16
 ; ZVFH-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFH-NEXT:    vfmadd.vv v16, v8, v24
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 4
+; ZVFH-NEXT:    slli a0, a0, 5
 ; ZVFH-NEXT:    add a0, sp, a0
 ; ZVFH-NEXT:    addi a0, a0, 16
-; ZVFH-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; ZVFH-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFH-NEXT:    vfmadd.vv v16, v8, v0
+; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFH-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; ZVFH-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; ZVFH-NEXT:    csrr a0, vlenb
-; ZVFH-NEXT:    slli a0, a0, 5
+; ZVFH-NEXT:    slli a0, a0, 3
+; ZVFH-NEXT:    mv a1, a0
+; ZVFH-NEXT:    slli a0, a0, 2
+; ZVFH-NEXT:    add a0, a0, a1
 ; ZVFH-NEXT:    add sp, sp, a0
 ; ZVFH-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFH-NEXT:    addi sp, sp, 16
@@ -495,22 +512,21 @@ define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 40
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v0
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
@@ -531,7 +547,12 @@ define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v0
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
@@ -542,16 +563,20 @@ define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v0
-; ZVFHMIN-NEXT:    vmv.v.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v8, v20
@@ -562,18 +587,19 @@ define <vscale x 32 x bfloat> @vfmadd_vf_nxv32bf16(<vscale x 32 x bfloat> %va, <
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v0
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 40
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
@@ -876,66 +902,69 @@ define <vscale x 32 x half> @vfmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
 ; ZVFHMIN-NEXT:    vmv8r.v v0, v16
-; ZVFHMIN-NEXT:    addi a1, sp, 16
+; ZVFHMIN-NEXT:    csrr a1, vlenb
+; ZVFHMIN-NEXT:    slli a1, a1, 3
+; ZVFHMIN-NEXT:    add a1, sp, a1
+; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
-; ZVFHMIN-NEXT:    vl8re16.v v8, (a0)
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
-; ZVFHMIN-NEXT:    add a0, sp, a0
-; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vl8re16.v v16, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v8
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    addi a0, sp, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v0, v8, v24
+; ZVFHMIN-NEXT:    vfmadd.vv v0, v16, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    li a1, 24
-; ZVFHMIN-NEXT:    mul a0, a0, a1
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v28
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 24
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
@@ -964,22 +993,21 @@ define <vscale x 32 x half> @vfmadd_vf_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 40
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v0, v16
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
 ; ZVFHMIN-NEXT:    fmv.x.h a0, fa0
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 4
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
 ; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
@@ -1000,7 +1028,12 @@ define <vscale x 32 x half> @vfmadd_vf_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v0
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
@@ -1011,16 +1044,20 @@ define <vscale x 32 x half> @vfmadd_vf_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v8, v24, v0
-; ZVFHMIN-NEXT:    vmv.v.v v24, v8
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v24, v0
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
@@ -1031,18 +1068,19 @@ define <vscale x 32 x half> @vfmadd_vf_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v24
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 5
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
-; ZVFHMIN-NEXT:    vl8r.v v0, (a0) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v16, v8, v0
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v24
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 5
+; ZVFHMIN-NEXT:    li a1, 40
+; ZVFHMIN-NEXT:    mul a0, a0, a1
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll
index 978347fa4fc10a..f6769601f488ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmsub-constrained-sdnode.ll
@@ -343,32 +343,37 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
 ; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vl8re16.v v24, (a0)
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    lui a0, 8
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v16
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v16
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v0, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a1, zero, e16, m8, ta, ma
 ; ZVFHMIN-NEXT:    vxor.vx v8, v8, a0
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    csrr a0, vlenb
+; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    add a0, sp, a0
+; ZVFHMIN-NEXT:    addi a0, a0, 16
+; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v0, v24
 ; ZVFHMIN-NEXT:    addi a0, sp, 16
 ; ZVFHMIN-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmadd.vv v0, v24, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
@@ -378,7 +383,7 @@ define <vscale x 32 x half> @vfmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
+; ZVFHMIN-NEXT:    slli a0, a0, 4
 ; ZVFHMIN-NEXT:    add a0, sp, a0
 ; ZVFHMIN-NEXT:    addi a0, a0, 16
 ; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
index f4a236df4c9e4f..1f5a17496b067a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfmul-vp.ll
@@ -610,75 +610,57 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    slli a2, a1, 3
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v24, v0
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a3, a1, 3
-; ZVFHMIN-NEXT:    add a1, a3, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v0, a1
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a4, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv4r.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB22_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB22_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfmul.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    slli a1, a0, 3
 ; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
@@ -703,55 +685,41 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v0, a1
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv4r.v v16, v0
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmul.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfmul.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB23_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB23_2:
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -759,7 +727,7 @@ define <vscale x 32 x half> @vfmul_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
index 874813f0575953..29f85344787aa5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptrunc-vp.ll
@@ -96,15 +96,8 @@ declare <vscale x 16 x float> @llvm.vp.fptrunc.nxv16f64.nxv16f32(<vscale x 16 x
 define <vscale x 16 x float> @vfptrunc_nxv16f32_nxv16f64(<vscale x 16 x double> %a, <vscale x 16 x i1> %m, i32 zeroext %vl) {
 ; CHECK-LABEL: vfptrunc_nxv16f32_nxv16f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    vmv1r.v v7, v0
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv8r.v v24, v16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 3
 ; CHECK-NEXT:    sub a3, a0, a1
@@ -113,8 +106,6 @@ define <vscale x 16 x float> @vfptrunc_nxv16f32_nxv16f64(<vscale x 16 x double>
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
 ; CHECK-NEXT:    vfncvt.f.f.w v20, v24, v0.t
 ; CHECK-NEXT:    bltu a0, a1, .LBB7_2
@@ -125,12 +116,6 @@ define <vscale x 16 x float> @vfptrunc_nxv16f32_nxv16f64(<vscale x 16 x double>
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
 ; CHECK-NEXT:    vfncvt.f.f.w v16, v8, v0.t
 ; CHECK-NEXT:    vmv8r.v v8, v16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x float> @llvm.vp.fptrunc.nxv16f64.nxv16f32(<vscale x 16 x double> %a, <vscale x 16 x i1> %m, i32 %vl)
   ret <vscale x 16 x float> %v
@@ -196,9 +181,9 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v28, v8, v0.t
+; CHECK-NEXT:    vfncvt.f.f.w v12, v24, v0.t
 ; CHECK-NEXT:    bltu a2, a1, .LBB8_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a1
@@ -208,10 +193,9 @@ define <vscale x 32 x float> @vfptrunc_nxv32f32_nxv32f64(<vscale x 32 x double>
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    vfncvt.f.f.w v24, v8, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vfncvt.f.f.w v8, v24, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
index 25a80e66c4a527..9bff9a13056864 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfsub-vp.ll
@@ -476,75 +476,57 @@ define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16(<vscale x 32 x bfloat> %va, bf
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 4
+; CHECK-NEXT:    slli a2, a1, 3
 ; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
+; CHECK-NEXT:    vmv1r.v v24, v0
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v12
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a3, a1, 3
-; CHECK-NEXT:    add a1, a3, a1
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv.v.x v0, a1
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
-; CHECK-NEXT:    csrr a4, vlenb
-; CHECK-NEXT:    slli a4, a4, 3
-; CHECK-NEXT:    add a4, sp, a4
-; CHECK-NEXT:    addi a4, a4, 16
-; CHECK-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a4, sp, 16
+; CHECK-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    vslidedown.vx v12, v24, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:    vmv4r.v v24, v0
 ; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a4, a3, 3
-; CHECK-NEXT:    add a3, a4, a3
 ; CHECK-NEXT:    add a3, sp, a3
 ; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v4
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB22_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB22_2:
-; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a2, a1, 3
-; CHECK-NEXT:    add a1, a2, a1
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; CHECK-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 4
+; CHECK-NEXT:    slli a1, a0, 3
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
@@ -563,55 +545,41 @@ define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    vmv8r.v v16, v8
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; CHECK-NEXT:    fmv.x.h a1, fa0
 ; CHECK-NEXT:    csrr a2, vlenb
 ; CHECK-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; CHECK-NEXT:    vmset.m v7
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmset.m v16
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v8, v20
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v12
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; CHECK-NEXT:    vmv.v.x v16, a1
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv.v.x v0, a1
 ; CHECK-NEXT:    slli a1, a2, 1
 ; CHECK-NEXT:    srli a2, a2, 2
 ; CHECK-NEXT:    sub a3, a0, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v7, a2
+; CHECK-NEXT:    vslidedown.vx v12, v16, a2
 ; CHECK-NEXT:    sltu a2, a0, a3
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a2, a2, a3
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    add a3, sp, a3
-; CHECK-NEXT:    addi a3, a3, 16
-; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv4r.v v16, v0
+; CHECK-NEXT:    addi a3, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v28
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v4
+; CHECK-NEXT:    vmv1r.v v0, v12
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vfsub.vv v16, v8, v16, v0.t
+; CHECK-NEXT:    vfsub.vv v16, v24, v16, v0.t
 ; CHECK-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v12, v16
 ; CHECK-NEXT:    bltu a0, a1, .LBB23_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a0, a1
 ; CHECK-NEXT:  .LBB23_2:
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
 ; CHECK-NEXT:    addi a1, sp, 16
-; CHECK-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v24
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
 ; CHECK-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vfwcvtbf16.f.f.v v24, v0
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -619,7 +587,7 @@ define <vscale x 32 x bfloat> @vfsub_vf_nxv32bf16_unmasked(<vscale x 32 x bfloat
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfncvtbf16.f.f.w v8, v16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -1232,75 +1200,57 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16(<vscale x 32 x half> %va, half %b
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 4
+; ZVFHMIN-NEXT:    slli a2, a1, 3
 ; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x11, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 17 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb
+; ZVFHMIN-NEXT:    vmv1r.v v24, v0
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v12
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a3, a1, 3
-; ZVFHMIN-NEXT:    add a1, a3, a1
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v0, a1
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
-; ZVFHMIN-NEXT:    csrr a4, vlenb
-; ZVFHMIN-NEXT:    slli a4, a4, 3
-; ZVFHMIN-NEXT:    add a4, sp, a4
-; ZVFHMIN-NEXT:    addi a4, a4, 16
-; ZVFHMIN-NEXT:    vs1r.v v0, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    addi a4, sp, 16
+; ZVFHMIN-NEXT:    vs1r.v v24, (a4) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v24, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
+; ZVFHMIN-NEXT:    vmv4r.v v24, v0
 ; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a4, a3, 3
-; ZVFHMIN-NEXT:    add a3, a4, a3
 ; ZVFHMIN-NEXT:    add a3, sp, a3
 ; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB46_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB46_2:
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a2, a1, 3
-; ZVFHMIN-NEXT:    add a1, a2, a1
 ; ZVFHMIN-NEXT:    add a1, sp, a1
 ; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
+; ZVFHMIN-NEXT:    addi a1, sp, 16
 ; ZVFHMIN-NEXT:    vl1r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vfsub.vv v16, v16, v24, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a1, a0, 4
+; ZVFHMIN-NEXT:    slli a1, a0, 3
 ; ZVFHMIN-NEXT:    add a0, a1, a0
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
@@ -1325,55 +1275,41 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    addi sp, sp, -16
 ; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
 ; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 4
+; ZVFHMIN-NEXT:    slli a1, a1, 3
 ; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; ZVFHMIN-NEXT:    vmv8r.v v16, v8
+; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
 ; ZVFHMIN-NEXT:    fmv.x.h a1, fa0
 ; ZVFHMIN-NEXT:    csrr a2, vlenb
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e8, m4, ta, ma
-; ZVFHMIN-NEXT:    vmset.m v7
-; ZVFHMIN-NEXT:    addi a3, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v8, (a3) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmset.m v16
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v20
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m8, ta, ma
-; ZVFHMIN-NEXT:    vmv.v.x v16, a1
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv.v.x v0, a1
 ; ZVFHMIN-NEXT:    slli a1, a2, 1
 ; ZVFHMIN-NEXT:    srli a2, a2, 2
 ; ZVFHMIN-NEXT:    sub a3, a0, a1
 ; ZVFHMIN-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; ZVFHMIN-NEXT:    vslidedown.vx v0, v7, a2
+; ZVFHMIN-NEXT:    vslidedown.vx v12, v16, a2
 ; ZVFHMIN-NEXT:    sltu a2, a0, a3
 ; ZVFHMIN-NEXT:    addi a2, a2, -1
 ; ZVFHMIN-NEXT:    and a2, a2, a3
-; ZVFHMIN-NEXT:    csrr a3, vlenb
-; ZVFHMIN-NEXT:    slli a3, a3, 3
-; ZVFHMIN-NEXT:    add a3, sp, a3
-; ZVFHMIN-NEXT:    addi a3, a3, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vmv4r.v v16, v0
+; ZVFHMIN-NEXT:    addi a3, sp, 16
+; ZVFHMIN-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
 ; ZVFHMIN-NEXT:    vsetvli a3, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v28
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT:    vmv1r.v v0, v12
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfsub.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT:    vfsub.vv v16, v24, v16, v0.t
 ; ZVFHMIN-NEXT:    vsetvli a2, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v12, v16
 ; ZVFHMIN-NEXT:    bltu a0, a1, .LBB47_2
 ; ZVFHMIN-NEXT:  # %bb.1:
 ; ZVFHMIN-NEXT:    mv a0, a1
 ; ZVFHMIN-NEXT:  .LBB47_2:
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
 ; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v24, (a1) # Unknown-size Folded Reload
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v24
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    add a1, sp, a1
-; ZVFHMIN-NEXT:    addi a1, a1, 16
 ; ZVFHMIN-NEXT:    vl8r.v v0, (a1) # Unknown-size Folded Reload
 ; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v0
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
@@ -1381,7 +1317,7 @@ define <vscale x 32 x half> @vfsub_vf_nxv32f16_unmasked(<vscale x 32 x half> %va
 ; ZVFHMIN-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; ZVFHMIN-NEXT:    vfncvt.f.f.w v8, v16
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 4
+; ZVFHMIN-NEXT:    slli a0, a0, 3
 ; ZVFHMIN-NEXT:    add sp, sp, a0
 ; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
 ; ZVFHMIN-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
index 6cd3884f029fdd..1b343788e03f8a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwmacc-vp.ll
@@ -633,28 +633,14 @@ define <vscale x 16 x float> @vfmacc_vv_nxv16f32(<vscale x 16 x half> %a, <vscal
 ;
 ; ZVFHMIN-LABEL: vfmacc_vv_nxv16f32:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v12
+; ZVFHMIN-NEXT:    vmv4r.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfmadd.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv.v.v v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x i1> %m, i32 %evl)
   %bext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %b, <vscale x 16 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
index 0a0bc6696a9f96..9e5da5a4effaf5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmacc-vp.ll
@@ -590,28 +590,14 @@ define <vscale x 16 x float> @vfnmacc_vv_nxv16f32(<vscale x 16 x half> %a, <vsca
 ;
 ; ZVFHMIN-LABEL: vfnmacc_vv_nxv16f32:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v12
+; ZVFHMIN-NEXT:    vmv4r.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmadd.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfnmadd.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv.v.v v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x i1> %m, i32 %evl)
   %bext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %b, <vscale x 16 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
index b5f7ef3380869f..c44fc1aa269621 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfwnmsac-vp.ll
@@ -566,28 +566,14 @@ define <vscale x 16 x float> @vfnmsac_vv_nxv16f32(<vscale x 16 x half> %a, <vsca
 ;
 ; ZVFHMIN-LABEL: vfnmsac_vv_nxv16f32:
 ; ZVFHMIN:       # %bb.0:
-; ZVFHMIN-NEXT:    addi sp, sp, -16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 16
-; ZVFHMIN-NEXT:    csrr a1, vlenb
-; ZVFHMIN-NEXT:    slli a1, a1, 3
-; ZVFHMIN-NEXT:    sub sp, sp, a1
-; ZVFHMIN-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; ZVFHMIN-NEXT:    addi a1, sp, 16
-; ZVFHMIN-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT:    vmv4r.v v4, v12
+; ZVFHMIN-NEXT:    vmv4r.v v24, v8
 ; ZVFHMIN-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8, v0.t
-; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v12, v0.t
-; ZVFHMIN-NEXT:    addi a0, sp, 16
-; ZVFHMIN-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v8, v24, v0.t
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v24, v4, v0.t
 ; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
-; ZVFHMIN-NEXT:    vfnmsub.vv v24, v16, v8, v0.t
+; ZVFHMIN-NEXT:    vfnmsub.vv v24, v8, v16, v0.t
 ; ZVFHMIN-NEXT:    vmv.v.v v8, v24
-; ZVFHMIN-NEXT:    csrr a0, vlenb
-; ZVFHMIN-NEXT:    slli a0, a0, 3
-; ZVFHMIN-NEXT:    add sp, sp, a0
-; ZVFHMIN-NEXT:    .cfi_def_cfa sp, 16
-; ZVFHMIN-NEXT:    addi sp, sp, 16
-; ZVFHMIN-NEXT:    .cfi_def_cfa_offset 0
 ; ZVFHMIN-NEXT:    ret
   %aext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %a, <vscale x 16 x i1> %m, i32 %evl)
   %bext = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> %b, <vscale x 16 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
index 8978dc268d4e52..0a2da7a3780444 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vpstore.ll
@@ -462,15 +462,7 @@ declare void @llvm.vp.store.nxv17f64.p0(<vscale x 17 x double>, ptr, <vscale x 1
 define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 17 x i1> %m, i32 zeroext %evl) {
 ; CHECK-LABEL: vpstore_nxv17f64:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a3, vlenb
-; CHECK-NEXT:    slli a3, a3, 3
-; CHECK-NEXT:    sub sp, sp, a3
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
-; CHECK-NEXT:    vmv1r.v v24, v0
-; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a4, a3, 1
 ; CHECK-NEXT:    mv a5, a2
@@ -483,14 +475,14 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:  # %bb.3:
 ; CHECK-NEXT:    mv a6, a3
 ; CHECK-NEXT:  .LBB36_4:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    vl8re64.v v16, (a0)
+; CHECK-NEXT:    vmv1r.v v0, v7
+; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    vsetvli zero, a6, e64, m8, ta, ma
 ; CHECK-NEXT:    vse64.v v8, (a1), v0.t
 ; CHECK-NEXT:    sub a0, a5, a3
 ; CHECK-NEXT:    srli a6, a3, 3
 ; CHECK-NEXT:    vsetvli a7, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a6
+; CHECK-NEXT:    vslidedown.vx v0, v7, a6
 ; CHECK-NEXT:    slli a6, a3, 3
 ; CHECK-NEXT:    sub a4, a2, a4
 ; CHECK-NEXT:    sltu a5, a5, a0
@@ -500,10 +492,8 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    addi a2, a2, -1
 ; CHECK-NEXT:    and a5, a5, a0
 ; CHECK-NEXT:    and a0, a2, a4
-; CHECK-NEXT:    addi a2, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a2) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a5, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v8, (a6), v0.t
+; CHECK-NEXT:    vse64.v v16, (a6), v0.t
 ; CHECK-NEXT:    bltu a0, a3, .LBB36_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a0, a3
@@ -511,16 +501,10 @@ define void @vpstore_nxv17f64(<vscale x 17 x double> %val, ptr %ptr, <vscale x 1
 ; CHECK-NEXT:    slli a2, a3, 4
 ; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a3
+; CHECK-NEXT:    vslidedown.vx v0, v7, a3
 ; CHECK-NEXT:    add a1, a1, a2
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT:    vse64.v v16, (a1), v0.t
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    vse64.v v24, (a1), v0.t
 ; CHECK-NEXT:    ret
   call void @llvm.vp.store.nxv17f64.p0(<vscale x 17 x double> %val, ptr %ptr, <vscale x 17 x i1> %m, i32 %evl)
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
index 1fc33dc73a27dc..6ef6d5ff37b52c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
@@ -514,19 +514,20 @@ define void @vselect_legalize_regression(<vscale x 16 x double> %a, <vscale x 16
 ; CHECK-LABEL: vselect_legalize_regression:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vlm.v v24, (a0)
+; CHECK-NEXT:    vlm.v v7, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
+; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    srli a2, a0, 3
 ; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    vmand.mm v7, v0, v24
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmand.mm v7, v0, v7
 ; CHECK-NEXT:    vsetvli a3, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v0, v7, a2
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
-; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT:    vmv1r.v v0, v7
-; CHECK-NEXT:    vmv.v.i v24, 0
 ; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
 ; CHECK-NEXT:    vs8r.v v8, (a1)
 ; CHECK-NEXT:    vs8r.v v16, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index bb51f0592dc17a..107817cefe3e9a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -354,48 +354,38 @@ define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a3, vlenb
 ; CHECK-NEXT:    slli a4, a3, 3
 ; CHECK-NEXT:    slli a1, a3, 1
 ; CHECK-NEXT:    srli a3, a3, 2
 ; CHECK-NEXT:    add a4, a0, a4
 ; CHECK-NEXT:    sub a5, a2, a1
-; CHECK-NEXT:    vl8re32.v v8, (a4)
+; CHECK-NEXT:    vl8re32.v v24, (a4)
 ; CHECK-NEXT:    sltu a4, a2, a5
 ; CHECK-NEXT:    addi a4, a4, -1
-; CHECK-NEXT:    vl8re32.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re32.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a3
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    and a4, a4, a5
 ; CHECK-NEXT:    vsetvli zero, a4, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT:    bltu a2, a1, .LBB27_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB27_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -413,48 +403,38 @@ define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
 ; CHECK-NEXT:    slli a2, a1, 1
 ; CHECK-NEXT:    srli a4, a1, 2
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    sub a5, a1, a2
-; CHECK-NEXT:    vl8re32.v v8, (a3)
+; CHECK-NEXT:    vl8re32.v v24, (a3)
 ; CHECK-NEXT:    sltu a3, a1, a5
 ; CHECK-NEXT:    addi a3, a3, -1
-; CHECK-NEXT:    vl8re32.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re32.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a4
+; CHECK-NEXT:    vslidedown.vx v0, v0, a4
 ; CHECK-NEXT:    and a3, a3, a5
 ; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT:    bltu a1, a2, .LBB28_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a1, a2
 ; CHECK-NEXT:  .LBB28_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
@@ -702,47 +682,37 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 4
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; CHECK-NEXT:    addi a1, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    vmv1r.v v7, v0
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a3, a1, 3
 ; CHECK-NEXT:    sub a4, a2, a1
 ; CHECK-NEXT:    add a3, a0, a3
 ; CHECK-NEXT:    sltu a5, a2, a4
-; CHECK-NEXT:    vl8re64.v v8, (a3)
+; CHECK-NEXT:    vl8re64.v v24, (a3)
 ; CHECK-NEXT:    addi a5, a5, -1
 ; CHECK-NEXT:    srli a3, a1, 3
-; CHECK-NEXT:    vl8re64.v v0, (a0)
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v0, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vl8re64.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v0, v24, a3
+; CHECK-NEXT:    vslidedown.vx v0, v0, a3
 ; CHECK-NEXT:    and a4, a5, a4
 ; CHECK-NEXT:    vsetvli zero, a4, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
 ; CHECK-NEXT:    bltu a2, a1, .LBB48_2
 ; CHECK-NEXT:  # %bb.1:
 ; CHECK-NEXT:    mv a2, a1
 ; CHECK-NEXT:  .LBB48_2:
-; CHECK-NEXT:    vmv1r.v v0, v24
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmv1r.v v0, v7
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
-; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
index e62b7a00396388..525788d3e2fe06 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vtrunc-vp.ll
@@ -337,9 +337,9 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    and a0, a3, a0
 ; CHECK-NEXT:    vmv1r.v v0, v6
 ; CHECK-NEXT:    addi a3, sp, 16
-; CHECK-NEXT:    vl8r.v v8, (a3) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v28, v8, 0, v0.t
+; CHECK-NEXT:    vnsrl.wi v12, v24, 0, v0.t
 ; CHECK-NEXT:    bltu a2, a1, .LBB17_6
 ; CHECK-NEXT:  # %bb.5:
 ; CHECK-NEXT:    mv a2, a1
@@ -349,10 +349,9 @@ define <vscale x 32 x i32> @vtrunc_nxv32i64_nxv32i32(<vscale x 32 x i64> %a, <vs
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vsetvli zero, a2, e32, m4, ta, ma
-; CHECK-NEXT:    vnsrl.wi v24, v8, 0, v0.t
-; CHECK-NEXT:    vmv8r.v v8, v24
+; CHECK-NEXT:    vnsrl.wi v8, v24, 0, v0.t
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0

>From 4806951eca46fe8f90047e533ddbd8ea061ea838 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Fri, 6 Dec 2024 13:23:09 +0800
Subject: [PATCH 2/2] Apply to all the targets

---
 .../include/llvm/CodeGen/TargetRegisterInfo.h |    4 -
 llvm/lib/CodeGen/CalcSpillWeights.cpp         |    4 +-
 llvm/lib/CodeGen/TargetRegisterInfo.cpp       |    5 -
 llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp   |    5 -
 llvm/lib/Target/RISCV/RISCVRegisterInfo.h     |    3 -
 llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll |   12 +-
 .../AMDGPU/greedy-global-heuristic.mir        |   39 +-
 .../AMDGPU/hsa-metadata-kernel-code-props.ll  |    8 +-
 .../identical-subrange-spill-infloop.ll       |  447 +-
 ...nfloop-subrange-spill-inspect-subrange.mir |   28 +-
 .../CodeGen/AMDGPU/infloop-subrange-spill.mir |   27 +-
 llvm/test/CodeGen/AMDGPU/issue48473.mir       |    2 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i1.ll  |   64 +-
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   |   14 +
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll   |   21 +-
 .../ra-inserted-scalar-instructions.mir       |  280 +-
 ...-unsatisfiable-overlapping-tuple-hints.mir |   16 +-
 llvm/test/CodeGen/AMDGPU/remat-smrd.mir       |   48 +-
 .../scc-clobbered-sgpr-to-vmem-spill.ll       |  330 +-
 .../AMDGPU/snippet-copy-bundle-regression.mir |   35 +-
 .../CodeGen/AMDGPU/spill-scavenge-offset.ll   |  140 +-
 .../test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll |   10 +-
 .../AMDGPU/splitkit-copy-live-lanes.mir       |  388 +-
 .../CodeGen/AVR/inline-asm/inline-asm3.ll     |   36 +-
 .../CodeGen/Hexagon/packetize-impdef-1.ll     |    4 +-
 .../CodeGen/Hexagon/regalloc-bad-undef.mir    |   64 +-
 llvm/test/CodeGen/PowerPC/p10-spill-creq.ll   |   21 +-
 llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll   |   36 +-
 .../SystemZ/cond-move-regalloc-hints.mir      |    3 +-
 .../CodeGen/SystemZ/fmuladd-soft-float.ll     |   25 +-
 llvm/test/CodeGen/SystemZ/int-conv-03.ll      |    2 +-
 llvm/test/CodeGen/SystemZ/int-conv-04.ll      |    2 +-
 llvm/test/CodeGen/SystemZ/int-conv-07.ll      |    2 +-
 llvm/test/CodeGen/SystemZ/int-conv-08.ll      |    2 +-
 ...ve-complex-deinterleaving-uniform-cases.ll |   29 +-
 llvm/test/CodeGen/Thumb2/mve-vst3.ll          |  422 +-
 llvm/test/CodeGen/Thumb2/mve-vst4.ll          |  104 +-
 llvm/test/CodeGen/X86/abds-neg.ll             |   19 +-
 llvm/test/CodeGen/X86/abdu-neg.ll             |   19 +-
 .../CodeGen/X86/amx_transpose_intrinsics.ll   |   36 +-
 llvm/test/CodeGen/X86/apx/mul-i1024.ll        |  398 +-
 llvm/test/CodeGen/X86/bitreverse.ll           |   74 +-
 .../X86/div-rem-pair-recomposition-signed.ll  |  384 +-
 .../div-rem-pair-recomposition-unsigned.ll    |  144 +-
 llvm/test/CodeGen/X86/extract-bits.ll         |   57 +-
 llvm/test/CodeGen/X86/fold-tied-op.ll         |   72 +-
 .../test/CodeGen/X86/fp128-libcalls-strict.ll |   12 +-
 .../test/CodeGen/X86/fptosi-sat-vector-128.ll |   36 +-
 llvm/test/CodeGen/X86/fshr.ll                 |   55 +-
 .../X86/merge-consecutive-loads-128.ll        |    6 +
 llvm/test/CodeGen/X86/mul-i1024.ll            | 3838 ++++----
 llvm/test/CodeGen/X86/mul-i256.ll             |  138 +-
 llvm/test/CodeGen/X86/mul-i512.ll             |  998 +--
 llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll |   36 +-
 llvm/test/CodeGen/X86/scmp.ll                 |  229 +-
 llvm/test/CodeGen/X86/sdiv_fix.ll             |   56 +-
 llvm/test/CodeGen/X86/sdiv_fix_sat.ll         |  169 +-
 llvm/test/CodeGen/X86/shift-i128.ll           |   47 +-
 llvm/test/CodeGen/X86/smul-with-overflow.ll   |  640 +-
 llvm/test/CodeGen/X86/smul_fix.ll             |   12 +-
 llvm/test/CodeGen/X86/smul_fix_sat.ll         |   76 +-
 .../X86/smulo-128-legalisation-lowering.ll    |  496 +-
 llvm/test/CodeGen/X86/sse-regcall4.ll         |    8 +-
 llvm/test/CodeGen/X86/sshl_sat_vec.ll         |   33 +-
 llvm/test/CodeGen/X86/statepoint-live-in.ll   |    4 +-
 llvm/test/CodeGen/X86/statepoint-regs.ll      |    4 +-
 .../statepoint-vreg-unlimited-tied-opnds.ll   |  135 +-
 .../subvectorwise-store-of-vector-splat.ll    |  181 +-
 llvm/test/CodeGen/X86/ucmp.ll                 |  943 +-
 llvm/test/CodeGen/X86/umul-with-overflow.ll   |   80 +-
 llvm/test/CodeGen/X86/umul_fix.ll             |   18 +-
 llvm/test/CodeGen/X86/umul_fix_sat.ll         |   39 +-
 .../X86/umulo-64-legalisation-lowering.ll     |   19 +-
 ...unfold-masked-merge-vector-variablemask.ll | 2358 +++--
 llvm/test/CodeGen/X86/ushl_sat_vec.ll         |   91 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 7696 ++++++++---------
 .../X86/wide-scalar-shift-legalization.ll     | 2157 +++--
 ...ad-of-small-alloca-with-zero-upper-half.ll |  196 +-
 llvm/test/CodeGen/X86/xmulo.ll                |  142 +-
 llvm/test/DebugInfo/COFF/fpo-csrs.ll          |   30 +-
 80 files changed, 12150 insertions(+), 12713 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 7dd272fed996d1..292fa3c94969be 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -926,10 +926,6 @@ class TargetRegisterInfo : public MCRegisterInfo {
   /// Returns a -1 terminated array of pressure set IDs.
   virtual const int *getRegUnitPressureSets(unsigned RegUnit) const = 0;
 
-  /// Get the factor of spill weight for this register class.
-  virtual unsigned
-  getSpillWeightScaleFactor(const TargetRegisterClass *RC) const;
-
   /// Get a list of 'hint' registers that the register allocator should try
   /// first when allocating a physical register for the virtual register
   /// VirtReg. These registers are effectively moved to the front of the
diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index 3bc0159fefd02b..5a1b8abb3a49b0 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -188,7 +188,9 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
   // Do not update future local split artifacts.
   bool ShouldUpdateLI = !IsLocalSplitArtifact;
 
-  unsigned Factor = TRI.getSpillWeightScaleFactor(MRI.getRegClass(LI.reg()));
+  // We will scale the weight by the register weight of register class.
+  unsigned Factor =
+      TRI.getRegClassWeight((MRI.getRegClass(LI.reg()))).RegWeight;
   if (IsLocalSplitArtifact) {
     MachineBasicBlock *LocalMBB = LIS.getMBBFromIndex(*End);
     assert(LocalMBB == LIS.getMBBFromIndex(*Start) &&
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index 93c59cb134d8c0..032f1a33e75c43 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -414,11 +414,6 @@ bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
   return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg);
 }
 
-unsigned TargetRegisterInfo::getSpillWeightScaleFactor(
-    const TargetRegisterClass *RC) const {
-  return 1;
-}
-
 // Compute target-independent register allocator hints to help eliminate copies.
 bool TargetRegisterInfo::getRegAllocationHints(
     Register VirtReg, ArrayRef<MCPhysReg> Order,
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 9e02f1ecc60cde..cfcc3119257f65 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -805,11 +805,6 @@ RISCVRegisterInfo::getRegisterCostTableIndex(const MachineFunction &MF) const {
              : 0;
 }
 
-unsigned RISCVRegisterInfo::getSpillWeightScaleFactor(
-    const TargetRegisterClass *RC) const {
-  return getRegClassWeight(RC).RegWeight;
-}
-
 // Add two address hints to improve chances of being able to use a compressed
 // instruction.
 bool RISCVRegisterInfo::getRegAllocationHints(
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 9b4317873fec61..3ab79694e175c8 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -127,9 +127,6 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
 
   unsigned getRegisterCostTableIndex(const MachineFunction &MF) const override;
 
-  unsigned
-  getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override;
-
   bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
                              SmallVectorImpl<MCPhysReg> &Hints,
                              const MachineFunction &MF, const VirtRegMap *VRM,
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
index c3694f3b92fb42..ae1779c3661f39 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll
@@ -113,13 +113,12 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; GCN-LABEL: test:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_clause 0x1
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GCN-NEXT:    s_load_dword vcc_lo, s[4:5], 0x8
+; GCN-NEXT:    s_load_dwordx2 vcc, s[4:5], 0x0
+; GCN-NEXT:    s_load_dword s0, s[4:5], 0x8
 ; GCN-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; GCN-NEXT:    ; kill: killed $sgpr4_sgpr5
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_writelane_b32 v0, s0, 0
-; GCN-NEXT:    v_writelane_b32 v0, s1, 1
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
@@ -128,6 +127,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    v_readlane_b32 s0, v0, 0
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
@@ -150,9 +150,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    v_readlane_b32 s0, v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v1, vcc_lo
-; GCN-NEXT:    v_readlane_b32 s1, v0, 1
+; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
@@ -216,7 +214,7 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) {
 ; GCN-NEXT:    ;;#ASMEND
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    global_store_dword v2, v1, s[0:1]
+; GCN-NEXT:    global_store_dword v2, v1, vcc
 ; GCN-NEXT:    s_endpgm
   call void asm sideeffect "", "~{s[0:7]}" ()
   call void asm sideeffect "", "~{s[8:15]}" ()
diff --git a/llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir b/llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir
index 6f1e5b89db8841..f69ba5499d2511 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir
+++ b/llvm/test/CodeGen/AMDGPU/greedy-global-heuristic.mir
@@ -56,7 +56,7 @@ body:             |
   ; CHECK-NEXT:   S_NOP 0, implicit-def %0
   ; CHECK-NEXT:   S_NOP 0, implicit-def %18
   ; CHECK-NEXT:   SI_SPILL_V128_SAVE %18, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
-  ; CHECK-NEXT:   S_NOP 0, implicit-def %35
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %36
   ; CHECK-NEXT:   S_NOP 0, implicit-def %27
   ; CHECK-NEXT:   S_NOP 0, implicit-def %29
   ; CHECK-NEXT:   S_NOP 0, implicit-def %31
@@ -67,8 +67,8 @@ body:             |
   ; CHECK-NEXT:   S_NOP 0, implicit %31
   ; CHECK-NEXT:   S_NOP 0, implicit %29
   ; CHECK-NEXT:   S_NOP 0, implicit %27
-  ; CHECK-NEXT:   S_NOP 0, implicit %35
-  ; CHECK-NEXT:   SI_SPILL_V128_SAVE %35, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   S_NOP 0, implicit %36
+  ; CHECK-NEXT:   SI_SPILL_V128_SAVE %36, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE]]
   ; CHECK-NEXT:   S_NOP 0, implicit %0
@@ -81,8 +81,8 @@ body:             |
   ; CHECK-NEXT:   S_NOP 0
   ; CHECK-NEXT:   S_NOP 0
   ; CHECK-NEXT:   S_NOP 0, implicit %0
-  ; CHECK-NEXT:   S_NOP 0, implicit-def %33
-  ; CHECK-NEXT:   SI_SPILL_V128_SAVE %33, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %32
+  ; CHECK-NEXT:   SI_SPILL_V128_SAVE %32, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
   ; CHECK-NEXT:   S_NOP 0, implicit %10
   ; CHECK-NEXT:   S_NOP 0
   ; CHECK-NEXT:   S_NOP 0
@@ -94,9 +94,10 @@ body:             |
   ; CHECK-NEXT:   S_NOP 0
   ; CHECK-NEXT:   S_NOP 0
   ; CHECK-NEXT:   S_NOP 0
-  ; CHECK-NEXT:   S_NOP 0, implicit-def %40
-  ; CHECK-NEXT:   SI_SPILL_V128_SAVE %40, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
-  ; CHECK-NEXT:   S_NOP 0, implicit %33
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %41
+  ; CHECK-NEXT:   SI_SPILL_V128_SAVE %41, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE1]]
   ; CHECK-NEXT:   S_NOP 0
   ; CHECK-NEXT:   S_NOP 0
   ; CHECK-NEXT:   S_NOP 0
@@ -113,7 +114,7 @@ body:             |
   ; CHECK-NEXT:   S_NOP 0
   ; CHECK-NEXT:   S_NOP 0, implicit-def %42
   ; CHECK-NEXT:   SI_SPILL_V128_SAVE %42, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
-  ; CHECK-NEXT:   S_NOP 0, implicit %40
+  ; CHECK-NEXT:   S_NOP 0, implicit %41
   ; CHECK-NEXT:   S_NOP 0
   ; CHECK-NEXT:   S_NOP 0
   ; CHECK-NEXT:   S_NOP 0
@@ -139,23 +140,23 @@ body:             |
   ; CHECK-NEXT:   S_NOP 0, implicit %29
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_128 = COPY %27
   ; CHECK-NEXT:   S_NOP 0, implicit %27
-  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]]
-  ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE1]]
-  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]]
   ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE2]]
-  ; CHECK-NEXT:   S_NOP 0, implicit %0
-  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE3]]
-  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+  ; CHECK-NEXT:   S_NOP 0, implicit %0
+  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
   ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE4]]
-  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
   ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE5]]
+  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
+  ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE6]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   S_NOP 0, implicit %0
-  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
-  ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE6]]
+  ; CHECK-NEXT:   [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+  ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V128_RESTORE7]]
   ; CHECK-NEXT:   S_NOP 0, implicit [[COPY3]]
   ; CHECK-NEXT:   S_NOP 0, implicit [[COPY2]]
   ; CHECK-NEXT:   S_NOP 0, implicit [[COPY1]]
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 8c017fa5ec2636..ed2d04fc0e0fbb 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -47,10 +47,10 @@ entry:
 }
 
 ; CHECK:   .name:       num_spilled_sgprs
-; GFX700:   .sgpr_spill_count: 10
-; GFX803:   .sgpr_spill_count: 10
-; GFX900:   .sgpr_spill_count: 62
-; GFX1010:  .sgpr_spill_count: 60
+; GFX700:   .sgpr_spill_count: 13
+; GFX803:   .sgpr_spill_count: 13
+; GFX900:   .sgpr_spill_count: 57
+; GFX1010:  .sgpr_spill_count: 56
 ; CHECK:   .symbol:     num_spilled_sgprs.kd
 define amdgpu_kernel void @num_spilled_sgprs(
     ptr addrspace(1) %out0, ptr addrspace(1) %out1, [8 x i32],
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 5dff660912e402..e5418a379f6bc3 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -8,158 +8,145 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    v_writelane_b32 v5, s30, 0
 ; CHECK-NEXT:    v_writelane_b32 v5, s31, 1
-; CHECK-NEXT:    v_writelane_b32 v5, s36, 2
-; CHECK-NEXT:    v_writelane_b32 v5, s37, 3
-; CHECK-NEXT:    v_writelane_b32 v5, s38, 4
-; CHECK-NEXT:    v_writelane_b32 v5, s39, 5
-; CHECK-NEXT:    v_writelane_b32 v5, s40, 6
-; CHECK-NEXT:    v_writelane_b32 v5, s41, 7
-; CHECK-NEXT:    v_writelane_b32 v5, s42, 8
-; CHECK-NEXT:    v_writelane_b32 v5, s43, 9
-; CHECK-NEXT:    v_writelane_b32 v5, s44, 10
-; CHECK-NEXT:    v_writelane_b32 v5, s45, 11
-; CHECK-NEXT:    v_writelane_b32 v5, s46, 12
-; CHECK-NEXT:    v_writelane_b32 v5, s47, 13
-; CHECK-NEXT:    v_writelane_b32 v5, s48, 14
-; CHECK-NEXT:    v_writelane_b32 v5, s49, 15
+; CHECK-NEXT:    v_writelane_b32 v5, s34, 2
+; CHECK-NEXT:    v_writelane_b32 v5, s35, 3
+; CHECK-NEXT:    v_writelane_b32 v5, s36, 4
+; CHECK-NEXT:    v_writelane_b32 v5, s37, 5
+; CHECK-NEXT:    v_writelane_b32 v5, s38, 6
+; CHECK-NEXT:    v_writelane_b32 v5, s39, 7
+; CHECK-NEXT:    v_writelane_b32 v5, s40, 8
+; CHECK-NEXT:    v_writelane_b32 v5, s41, 9
+; CHECK-NEXT:    v_writelane_b32 v5, s42, 10
+; CHECK-NEXT:    v_writelane_b32 v5, s43, 11
+; CHECK-NEXT:    v_writelane_b32 v5, s44, 12
+; CHECK-NEXT:    v_writelane_b32 v5, s45, 13
+; CHECK-NEXT:    v_writelane_b32 v5, s46, 14
+; CHECK-NEXT:    v_writelane_b32 v5, s47, 15
+; CHECK-NEXT:    v_writelane_b32 v5, s48, 16
+; CHECK-NEXT:    v_writelane_b32 v5, s49, 17
 ; CHECK-NEXT:    s_getpc_b64 s[24:25]
-; CHECK-NEXT:    v_writelane_b32 v5, s50, 16
+; CHECK-NEXT:    v_writelane_b32 v5, s50, 18
 ; CHECK-NEXT:    s_movk_i32 s4, 0xf0
 ; CHECK-NEXT:    s_mov_b32 s5, s24
-; CHECK-NEXT:    v_writelane_b32 v5, s51, 17
+; CHECK-NEXT:    v_writelane_b32 v5, s51, 19
 ; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
-; CHECK-NEXT:    ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; CHECK-NEXT:    ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    s_load_dwordx4 s[28:31], s[4:5], 0x0
 ; CHECK-NEXT:    s_movk_i32 s20, 0x130
 ; CHECK-NEXT:    s_mov_b32 s21, s24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v7, s36, 0
-; CHECK-NEXT:    v_writelane_b32 v7, s37, 1
-; CHECK-NEXT:    v_writelane_b32 v7, s38, 2
-; CHECK-NEXT:    v_writelane_b32 v7, s39, 3
-; CHECK-NEXT:    v_writelane_b32 v7, s40, 4
-; CHECK-NEXT:    v_writelane_b32 v7, s41, 5
-; CHECK-NEXT:    v_writelane_b32 v7, s42, 6
-; CHECK-NEXT:    v_writelane_b32 v7, s43, 7
-; CHECK-NEXT:    v_writelane_b32 v7, s44, 8
-; CHECK-NEXT:    v_writelane_b32 v7, s45, 9
-; CHECK-NEXT:    v_writelane_b32 v7, s46, 10
+; CHECK-NEXT:    v_writelane_b32 v6, s36, 0
+; CHECK-NEXT:    v_writelane_b32 v6, s37, 1
+; CHECK-NEXT:    v_writelane_b32 v6, s38, 2
+; CHECK-NEXT:    v_writelane_b32 v6, s39, 3
+; CHECK-NEXT:    v_writelane_b32 v6, s40, 4
+; CHECK-NEXT:    v_writelane_b32 v6, s41, 5
+; CHECK-NEXT:    v_writelane_b32 v6, s42, 6
+; CHECK-NEXT:    v_writelane_b32 v6, s43, 7
+; CHECK-NEXT:    v_writelane_b32 v6, s44, 8
+; CHECK-NEXT:    v_writelane_b32 v6, s45, 9
+; CHECK-NEXT:    v_writelane_b32 v6, s46, 10
 ; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[20:21], 0x0
-; CHECK-NEXT:    v_writelane_b32 v7, s47, 11
-; CHECK-NEXT:    v_writelane_b32 v7, s48, 12
+; CHECK-NEXT:    v_writelane_b32 v6, s47, 11
+; CHECK-NEXT:    v_writelane_b32 v6, s48, 12
 ; CHECK-NEXT:    s_mov_b32 s20, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
-; CHECK-NEXT:    v_writelane_b32 v7, s49, 13
+; CHECK-NEXT:    v_writelane_b32 v6, s49, 13
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s28
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v1
 ; CHECK-NEXT:    s_mov_b32 s21, s20
 ; CHECK-NEXT:    s_mov_b32 s22, s20
 ; CHECK-NEXT:    s_mov_b32 s23, s20
-; CHECK-NEXT:    v_writelane_b32 v7, s50, 14
-; CHECK-NEXT:    v_writelane_b32 v7, s51, 15
+; CHECK-NEXT:    v_writelane_b32 v6, s50, 14
+; CHECK-NEXT:    v_writelane_b32 v6, s51, 15
 ; CHECK-NEXT:    image_sample_lz v3, v[2:3], s[44:51], s[20:23] dmask:0x1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v7, s4, 16
-; CHECK-NEXT:    v_writelane_b32 v7, s5, 17
-; CHECK-NEXT:    v_writelane_b32 v7, s6, 18
-; CHECK-NEXT:    v_writelane_b32 v7, s7, 19
-; CHECK-NEXT:    v_writelane_b32 v7, s8, 20
-; CHECK-NEXT:    v_writelane_b32 v7, s9, 21
+; CHECK-NEXT:    v_writelane_b32 v6, s4, 16
+; CHECK-NEXT:    v_writelane_b32 v5, s52, 20
+; CHECK-NEXT:    v_writelane_b32 v6, s5, 17
+; CHECK-NEXT:    v_writelane_b32 v5, s53, 21
+; CHECK-NEXT:    v_writelane_b32 v6, s6, 18
+; CHECK-NEXT:    v_writelane_b32 v5, s54, 22
 ; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[4:11], s[20:23] dmask:0x1
-; CHECK-NEXT:    v_writelane_b32 v7, s10, 22
-; CHECK-NEXT:    v_writelane_b32 v7, s11, 23
-; CHECK-NEXT:    v_writelane_b32 v7, s12, 24
-; CHECK-NEXT:    v_writelane_b32 v7, s13, 25
-; CHECK-NEXT:    v_writelane_b32 v7, s14, 26
-; CHECK-NEXT:    v_writelane_b32 v7, s15, 27
-; CHECK-NEXT:    v_writelane_b32 v5, s52, 18
-; CHECK-NEXT:    v_writelane_b32 v7, s16, 28
-; CHECK-NEXT:    v_writelane_b32 v5, s53, 19
-; CHECK-NEXT:    v_writelane_b32 v7, s17, 29
-; CHECK-NEXT:    v_writelane_b32 v5, s54, 20
-; CHECK-NEXT:    v_writelane_b32 v7, s18, 30
-; CHECK-NEXT:    s_mov_b32 s26, 48
-; CHECK-NEXT:    s_mov_b32 s27, s24
-; CHECK-NEXT:    v_writelane_b32 v5, s55, 21
-; CHECK-NEXT:    v_writelane_b32 v7, s19, 31
-; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[26:27], 0x0
-; CHECK-NEXT:    v_writelane_b32 v5, s56, 22
-; CHECK-NEXT:    v_writelane_b32 v5, s57, 23
-; CHECK-NEXT:    v_writelane_b32 v5, s58, 24
-; CHECK-NEXT:    v_writelane_b32 v5, s59, 25
-; CHECK-NEXT:    v_writelane_b32 v5, s60, 26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v7, s4, 32
-; CHECK-NEXT:    v_writelane_b32 v5, s61, 27
-; CHECK-NEXT:    v_writelane_b32 v7, s5, 33
-; CHECK-NEXT:    v_writelane_b32 v5, s62, 28
-; CHECK-NEXT:    v_writelane_b32 v7, s6, 34
-; CHECK-NEXT:    v_writelane_b32 v5, s63, 29
-; CHECK-NEXT:    v_writelane_b32 v7, s7, 35
-; CHECK-NEXT:    v_writelane_b32 v5, s64, 30
-; CHECK-NEXT:    v_writelane_b32 v7, s8, 36
-; CHECK-NEXT:    v_writelane_b32 v5, s65, 31
-; CHECK-NEXT:    v_writelane_b32 v7, s9, 37
-; CHECK-NEXT:    v_writelane_b32 v5, s66, 32
-; CHECK-NEXT:    s_movk_i32 s28, 0x1f0
-; CHECK-NEXT:    s_movk_i32 s30, 0x2f0
-; CHECK-NEXT:    s_mov_b32 s29, s24
-; CHECK-NEXT:    s_mov_b32 s31, s24
-; CHECK-NEXT:    v_writelane_b32 v7, s10, 38
-; CHECK-NEXT:    v_writelane_b32 v5, s67, 33
-; CHECK-NEXT:    v_writelane_b32 v7, s11, 39
-; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[28:29], 0x0
-; CHECK-NEXT:    s_load_dwordx16 s[4:19], s[30:31], 0x0
+; CHECK-NEXT:    v_writelane_b32 v6, s7, 19
+; CHECK-NEXT:    v_writelane_b32 v5, s55, 23
+; CHECK-NEXT:    v_writelane_b32 v6, s8, 20
+; CHECK-NEXT:    v_writelane_b32 v5, s56, 24
+; CHECK-NEXT:    v_writelane_b32 v6, s9, 21
+; CHECK-NEXT:    v_writelane_b32 v5, s57, 25
+; CHECK-NEXT:    v_writelane_b32 v6, s10, 22
+; CHECK-NEXT:    v_writelane_b32 v5, s58, 26
+; CHECK-NEXT:    v_writelane_b32 v6, s11, 23
+; CHECK-NEXT:    v_writelane_b32 v5, s59, 27
+; CHECK-NEXT:    v_writelane_b32 v6, s12, 24
+; CHECK-NEXT:    v_writelane_b32 v5, s60, 28
+; CHECK-NEXT:    v_writelane_b32 v6, s13, 25
+; CHECK-NEXT:    v_writelane_b32 v5, s61, 29
+; CHECK-NEXT:    v_writelane_b32 v6, s14, 26
+; CHECK-NEXT:    v_writelane_b32 v5, s62, 30
+; CHECK-NEXT:    v_writelane_b32 v6, s15, 27
+; CHECK-NEXT:    v_writelane_b32 v5, s63, 31
+; CHECK-NEXT:    v_writelane_b32 v6, s16, 28
+; CHECK-NEXT:    v_writelane_b32 v5, s64, 32
+; CHECK-NEXT:    v_writelane_b32 v6, s17, 29
+; CHECK-NEXT:    v_writelane_b32 v5, s65, 33
+; CHECK-NEXT:    v_writelane_b32 v6, s18, 30
+; CHECK-NEXT:    v_writelane_b32 v5, s66, 34
+; CHECK-NEXT:    v_writelane_b32 v6, s19, 31
+; CHECK-NEXT:    s_mov_b32 s4, 48
+; CHECK-NEXT:    s_movk_i32 s8, 0x2f0
+; CHECK-NEXT:    s_mov_b32 s5, s24
+; CHECK-NEXT:    s_mov_b32 s9, s24
+; CHECK-NEXT:    v_writelane_b32 v5, s67, 35
+; CHECK-NEXT:    s_movk_i32 s6, 0x1f0
+; CHECK-NEXT:    s_mov_b32 s7, s24
+; CHECK-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[36:51], s[6:7], 0x0
+; CHECK-NEXT:    s_load_dwordx16 s[52:67], s[8:9], 0x0
 ; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT:    s_xor_b64 s[24:25], vcc, -1
+; CHECK-NEXT:    v_writelane_b32 v5, s68, 36
+; CHECK-NEXT:    s_xor_b64 s[34:35], vcc, -1
+; CHECK-NEXT:    v_writelane_b32 v5, s69, 37
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v4, v3
-; CHECK-NEXT:    s_and_saveexec_b64 s[26:27], s[24:25]
-; CHECK-NEXT:    s_xor_b64 s[26:27], exec, s[26:27]
+; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], s[34:35]
+; CHECK-NEXT:    s_xor_b64 s[68:69], exec, s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_3
 ; CHECK-NEXT:  ; %bb.1: ; %bb48
-; CHECK-NEXT:    v_readlane_b32 s36, v7, 0
-; CHECK-NEXT:    v_readlane_b32 s44, v7, 8
-; CHECK-NEXT:    v_readlane_b32 s45, v7, 9
-; CHECK-NEXT:    v_readlane_b32 s46, v7, 10
-; CHECK-NEXT:    v_readlane_b32 s47, v7, 11
-; CHECK-NEXT:    v_readlane_b32 s48, v7, 12
-; CHECK-NEXT:    v_readlane_b32 s49, v7, 13
-; CHECK-NEXT:    v_readlane_b32 s50, v7, 14
-; CHECK-NEXT:    v_readlane_b32 s51, v7, 15
+; CHECK-NEXT:    v_readlane_b32 s4, v6, 0
+; CHECK-NEXT:    v_readlane_b32 s12, v6, 8
+; CHECK-NEXT:    v_readlane_b32 s13, v6, 9
+; CHECK-NEXT:    v_readlane_b32 s14, v6, 10
+; CHECK-NEXT:    v_readlane_b32 s15, v6, 11
+; CHECK-NEXT:    v_readlane_b32 s16, v6, 12
+; CHECK-NEXT:    v_readlane_b32 s17, v6, 13
+; CHECK-NEXT:    v_readlane_b32 s18, v6, 14
+; CHECK-NEXT:    v_readlane_b32 s19, v6, 15
 ; CHECK-NEXT:    s_and_b64 vcc, exec, -1
-; CHECK-NEXT:    v_readlane_b32 s37, v7, 1
-; CHECK-NEXT:    v_readlane_b32 s38, v7, 2
-; CHECK-NEXT:    v_readlane_b32 s39, v7, 3
-; CHECK-NEXT:    v_readlane_b32 s40, v7, 4
-; CHECK-NEXT:    image_sample_lz v3, v[1:2], s[44:51], s[20:23] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s5, v6, 1
+; CHECK-NEXT:    v_readlane_b32 s6, v6, 2
+; CHECK-NEXT:    v_readlane_b32 s7, v6, 3
+; CHECK-NEXT:    v_readlane_b32 s8, v6, 4
+; CHECK-NEXT:    image_sample_lz v3, v[1:2], s[12:19], s[20:23] dmask:0x1
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    v_readlane_b32 s41, v7, 5
-; CHECK-NEXT:    v_readlane_b32 s42, v7, 6
-; CHECK-NEXT:    v_readlane_b32 s43, v7, 7
+; CHECK-NEXT:    v_readlane_b32 s9, v6, 5
+; CHECK-NEXT:    v_readlane_b32 s10, v6, 6
+; CHECK-NEXT:    v_readlane_b32 s11, v6, 7
 ; CHECK-NEXT:  .LBB0_2: ; %bb50
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    v_readlane_b32 s36, v7, 32
-; CHECK-NEXT:    v_readlane_b32 s40, v7, 36
-; CHECK-NEXT:    v_readlane_b32 s41, v7, 37
-; CHECK-NEXT:    v_readlane_b32 s42, v7, 38
-; CHECK-NEXT:    v_readlane_b32 s43, v7, 39
 ; CHECK-NEXT:    s_mov_b32 s21, s20
 ; CHECK-NEXT:    s_mov_b32 s22, s20
 ; CHECK-NEXT:    s_mov_b32 s23, s20
-; CHECK-NEXT:    v_readlane_b32 s37, v7, 33
-; CHECK-NEXT:    v_readlane_b32 s38, v7, 34
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[60:67], s[40:43] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s39, v7, 35
-; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1
+; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[44:51], s[28:31] dmask:0x1
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    image_sample_lz v1, v[1:2], s[60:67], s[20:23] dmask:0x1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v1, v1, v4
 ; CHECK-NEXT:    v_mul_f32_e32 v1, v1, v0
@@ -168,158 +155,77 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: ; %Flow14
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_readlane_b32 s12, v7, 32
-; CHECK-NEXT:    v_readlane_b32 s13, v7, 33
-; CHECK-NEXT:    v_readlane_b32 s14, v7, 34
-; CHECK-NEXT:    v_readlane_b32 s15, v7, 35
-; CHECK-NEXT:    v_readlane_b32 s16, v7, 36
-; CHECK-NEXT:    v_readlane_b32 s17, v7, 37
-; CHECK-NEXT:    v_readlane_b32 s18, v7, 38
-; CHECK-NEXT:    v_readlane_b32 s19, v7, 39
-; CHECK-NEXT:    v_writelane_b32 v7, s4, 40
-; CHECK-NEXT:    v_writelane_b32 v7, s5, 41
-; CHECK-NEXT:    v_writelane_b32 v7, s6, 42
-; CHECK-NEXT:    v_writelane_b32 v7, s7, 43
-; CHECK-NEXT:    v_writelane_b32 v7, s8, 44
-; CHECK-NEXT:    v_writelane_b32 v7, s9, 45
-; CHECK-NEXT:    v_writelane_b32 v7, s10, 46
-; CHECK-NEXT:    v_writelane_b32 v7, s11, 47
-; CHECK-NEXT:    v_writelane_b32 v7, s12, 48
-; CHECK-NEXT:    v_writelane_b32 v7, s13, 49
-; CHECK-NEXT:    v_writelane_b32 v7, s14, 50
-; CHECK-NEXT:    v_writelane_b32 v7, s15, 51
-; CHECK-NEXT:    v_writelane_b32 v7, s16, 52
-; CHECK-NEXT:    v_writelane_b32 v7, s17, 53
-; CHECK-NEXT:    v_writelane_b32 v7, s18, 54
-; CHECK-NEXT:    v_writelane_b32 v7, s19, 55
-; CHECK-NEXT:    ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
-; CHECK-NEXT:    v_writelane_b32 v7, s52, 56
-; CHECK-NEXT:    v_writelane_b32 v6, s60, 0
-; CHECK-NEXT:    v_writelane_b32 v7, s53, 57
-; CHECK-NEXT:    v_writelane_b32 v6, s61, 1
-; CHECK-NEXT:    v_writelane_b32 v7, s54, 58
-; CHECK-NEXT:    v_writelane_b32 v6, s62, 2
-; CHECK-NEXT:    v_writelane_b32 v7, s55, 59
-; CHECK-NEXT:    v_writelane_b32 v6, s63, 3
-; CHECK-NEXT:    v_writelane_b32 v7, s56, 60
-; CHECK-NEXT:    v_writelane_b32 v6, s64, 4
-; CHECK-NEXT:    v_writelane_b32 v7, s57, 61
-; CHECK-NEXT:    v_writelane_b32 v6, s65, 5
-; CHECK-NEXT:    v_writelane_b32 v7, s58, 62
-; CHECK-NEXT:    v_writelane_b32 v6, s66, 6
-; CHECK-NEXT:    v_writelane_b32 v7, s59, 63
-; CHECK-NEXT:    v_writelane_b32 v6, s67, 7
-; CHECK-NEXT:    s_andn2_saveexec_b64 s[20:21], s[26:27]
+; CHECK-NEXT:    s_mov_b64 s[46:47], s[26:27]
+; CHECK-NEXT:    s_mov_b64 s[44:45], s[24:25]
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[28:29], s[68:69]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_10
 ; CHECK-NEXT:  ; %bb.4: ; %bb32
-; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[24:25]
-; CHECK-NEXT:    s_xor_b64 s[22:23], exec, s[8:9]
+; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[34:35]
+; CHECK-NEXT:    s_xor_b64 s[4:5], exec, s[8:9]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_6
 ; CHECK-NEXT:  ; %bb.5: ; %bb43
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s8
-; CHECK-NEXT:    v_readlane_b32 s36, v7, 0
+; CHECK-NEXT:    v_readlane_b32 s12, v6, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s9
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
-; CHECK-NEXT:    v_readlane_b32 s37, v7, 1
-; CHECK-NEXT:    v_readlane_b32 s38, v7, 2
-; CHECK-NEXT:    v_readlane_b32 s39, v7, 3
-; CHECK-NEXT:    v_readlane_b32 s40, v7, 4
-; CHECK-NEXT:    v_readlane_b32 s41, v7, 5
-; CHECK-NEXT:    v_readlane_b32 s42, v7, 6
-; CHECK-NEXT:    v_readlane_b32 s43, v7, 7
-; CHECK-NEXT:    v_readlane_b32 s44, v7, 8
-; CHECK-NEXT:    v_readlane_b32 s45, v7, 9
-; CHECK-NEXT:    v_readlane_b32 s46, v7, 10
-; CHECK-NEXT:    v_readlane_b32 s47, v7, 11
-; CHECK-NEXT:    v_readlane_b32 s48, v7, 12
-; CHECK-NEXT:    v_readlane_b32 s49, v7, 13
-; CHECK-NEXT:    v_readlane_b32 s50, v7, 14
-; CHECK-NEXT:    v_readlane_b32 s51, v7, 15
-; CHECK-NEXT:    image_sample_lz v2, v[0:1], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s36, v7, 16
-; CHECK-NEXT:    v_readlane_b32 s44, v7, 24
-; CHECK-NEXT:    v_readlane_b32 s45, v7, 25
-; CHECK-NEXT:    v_readlane_b32 s46, v7, 26
-; CHECK-NEXT:    v_readlane_b32 s47, v7, 27
-; CHECK-NEXT:    v_readlane_b32 s48, v7, 28
-; CHECK-NEXT:    v_readlane_b32 s49, v7, 29
-; CHECK-NEXT:    v_readlane_b32 s50, v7, 30
-; CHECK-NEXT:    v_readlane_b32 s51, v7, 31
+; CHECK-NEXT:    v_readlane_b32 s13, v6, 1
+; CHECK-NEXT:    v_readlane_b32 s14, v6, 2
+; CHECK-NEXT:    v_readlane_b32 s15, v6, 3
+; CHECK-NEXT:    v_readlane_b32 s16, v6, 4
+; CHECK-NEXT:    v_readlane_b32 s17, v6, 5
+; CHECK-NEXT:    v_readlane_b32 s18, v6, 6
+; CHECK-NEXT:    v_readlane_b32 s19, v6, 7
+; CHECK-NEXT:    v_readlane_b32 s20, v6, 8
+; CHECK-NEXT:    v_readlane_b32 s21, v6, 9
+; CHECK-NEXT:    v_readlane_b32 s22, v6, 10
+; CHECK-NEXT:    v_readlane_b32 s23, v6, 11
+; CHECK-NEXT:    v_readlane_b32 s24, v6, 12
+; CHECK-NEXT:    v_readlane_b32 s25, v6, 13
+; CHECK-NEXT:    v_readlane_b32 s26, v6, 14
+; CHECK-NEXT:    v_readlane_b32 s27, v6, 15
+; CHECK-NEXT:    image_sample_lz v2, v[0:1], s[12:19], s[8:11] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s12, v6, 16
+; CHECK-NEXT:    v_readlane_b32 s20, v6, 24
+; CHECK-NEXT:    v_readlane_b32 s21, v6, 25
+; CHECK-NEXT:    v_readlane_b32 s22, v6, 26
+; CHECK-NEXT:    v_readlane_b32 s23, v6, 27
+; CHECK-NEXT:    v_readlane_b32 s24, v6, 28
+; CHECK-NEXT:    v_readlane_b32 s25, v6, 29
+; CHECK-NEXT:    v_readlane_b32 s26, v6, 30
+; CHECK-NEXT:    v_readlane_b32 s27, v6, 31
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v3
-; CHECK-NEXT:    v_readlane_b32 s37, v7, 17
-; CHECK-NEXT:    v_readlane_b32 s38, v7, 18
-; CHECK-NEXT:    v_readlane_b32 s39, v7, 19
-; CHECK-NEXT:    image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1
-; CHECK-NEXT:    v_readlane_b32 s40, v7, 20
-; CHECK-NEXT:    v_readlane_b32 s41, v7, 21
-; CHECK-NEXT:    v_readlane_b32 s42, v7, 22
-; CHECK-NEXT:    v_readlane_b32 s43, v7, 23
+; CHECK-NEXT:    v_readlane_b32 s13, v6, 17
+; CHECK-NEXT:    v_readlane_b32 s14, v6, 18
+; CHECK-NEXT:    v_readlane_b32 s15, v6, 19
+; CHECK-NEXT:    image_sample_lz v0, v[0:1], s[20:27], s[44:47] dmask:0x1
+; CHECK-NEXT:    v_readlane_b32 s16, v6, 20
+; CHECK-NEXT:    v_readlane_b32 s17, v6, 21
+; CHECK-NEXT:    v_readlane_b32 s18, v6, 22
+; CHECK-NEXT:    v_readlane_b32 s19, v6, 23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
 ; CHECK-NEXT:    buffer_store_dwordx3 v[2:4], off, s[8:11], 0
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
 ; CHECK-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:  .LBB0_6: ; %Flow12
-; CHECK-NEXT:    s_or_saveexec_b64 s[4:5], s[22:23]
-; CHECK-NEXT:    v_readlane_b32 s52, v7, 40
-; CHECK-NEXT:    v_readlane_b32 s53, v7, 41
-; CHECK-NEXT:    v_readlane_b32 s54, v7, 42
-; CHECK-NEXT:    v_readlane_b32 s55, v7, 43
-; CHECK-NEXT:    v_readlane_b32 s56, v7, 44
-; CHECK-NEXT:    v_readlane_b32 s57, v7, 45
-; CHECK-NEXT:    v_readlane_b32 s58, v7, 46
-; CHECK-NEXT:    v_readlane_b32 s59, v7, 47
-; CHECK-NEXT:    v_readlane_b32 s60, v7, 48
-; CHECK-NEXT:    v_readlane_b32 s61, v7, 49
-; CHECK-NEXT:    v_readlane_b32 s62, v7, 50
-; CHECK-NEXT:    v_readlane_b32 s63, v7, 51
-; CHECK-NEXT:    v_readlane_b32 s64, v7, 52
-; CHECK-NEXT:    v_readlane_b32 s65, v7, 53
-; CHECK-NEXT:    v_readlane_b32 s66, v7, 54
-; CHECK-NEXT:    v_readlane_b32 s67, v7, 55
-; CHECK-NEXT:    s_xor_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
 ; CHECK-NEXT:    s_cbranch_execz .LBB0_9
 ; CHECK-NEXT:  ; %bb.7: ; %bb33.preheader
 ; CHECK-NEXT:    s_mov_b32 s8, 0
 ; CHECK-NEXT:    s_mov_b32 s6, s8
 ; CHECK-NEXT:    s_mov_b32 s7, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s6
-; CHECK-NEXT:    v_readlane_b32 s36, v7, 56
 ; CHECK-NEXT:    s_mov_b32 s9, s8
 ; CHECK-NEXT:    s_mov_b32 s10, s8
 ; CHECK-NEXT:    s_mov_b32 s11, s8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s7
-; CHECK-NEXT:    v_readlane_b32 s37, v7, 57
-; CHECK-NEXT:    v_readlane_b32 s38, v7, 58
-; CHECK-NEXT:    v_readlane_b32 s39, v7, 59
-; CHECK-NEXT:    v_readlane_b32 s40, v7, 60
-; CHECK-NEXT:    v_readlane_b32 s41, v7, 61
-; CHECK-NEXT:    v_readlane_b32 s42, v7, 62
-; CHECK-NEXT:    v_readlane_b32 s43, v7, 63
-; CHECK-NEXT:    s_nop 4
 ; CHECK-NEXT:    image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1
 ; CHECK-NEXT:    image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1
-; CHECK-NEXT:    ; kill: killed $vgpr1_vgpr2
-; CHECK-NEXT:    s_mov_b64 s[12:13], s[36:37]
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
-; CHECK-NEXT:    v_readlane_b32 s44, v6, 0
-; CHECK-NEXT:    v_readlane_b32 s45, v6, 1
-; CHECK-NEXT:    v_readlane_b32 s46, v6, 2
-; CHECK-NEXT:    v_readlane_b32 s47, v6, 3
-; CHECK-NEXT:    v_readlane_b32 s48, v6, 4
-; CHECK-NEXT:    v_readlane_b32 s49, v6, 5
-; CHECK-NEXT:    v_readlane_b32 s50, v6, 6
-; CHECK-NEXT:    v_readlane_b32 s51, v6, 7
-; CHECK-NEXT:    s_mov_b64 s[14:15], s[38:39]
-; CHECK-NEXT:    s_mov_b64 s[16:17], s[40:41]
-; CHECK-NEXT:    s_mov_b64 s[18:19], s[42:43]
-; CHECK-NEXT:    ; kill: killed $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19
-; CHECK-NEXT:    ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11
-; CHECK-NEXT:    ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_sub_f32_e32 v1, v4, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v0, v1, v0
@@ -333,45 +239,48 @@ define void @main(i1 %arg) #0 {
 ; CHECK-NEXT:  .LBB0_9: ; %Flow13
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:  .LBB0_10: ; %UnifiedReturnBlock
-; CHECK-NEXT:    s_or_b64 exec, exec, s[20:21]
-; CHECK-NEXT:    v_readlane_b32 s67, v5, 33
-; CHECK-NEXT:    v_readlane_b32 s66, v5, 32
-; CHECK-NEXT:    v_readlane_b32 s65, v5, 31
-; CHECK-NEXT:    v_readlane_b32 s64, v5, 30
-; CHECK-NEXT:    v_readlane_b32 s63, v5, 29
-; CHECK-NEXT:    v_readlane_b32 s62, v5, 28
-; CHECK-NEXT:    v_readlane_b32 s61, v5, 27
-; CHECK-NEXT:    v_readlane_b32 s60, v5, 26
-; CHECK-NEXT:    v_readlane_b32 s59, v5, 25
-; CHECK-NEXT:    v_readlane_b32 s58, v5, 24
-; CHECK-NEXT:    v_readlane_b32 s57, v5, 23
-; CHECK-NEXT:    v_readlane_b32 s56, v5, 22
-; CHECK-NEXT:    v_readlane_b32 s55, v5, 21
-; CHECK-NEXT:    v_readlane_b32 s54, v5, 20
-; CHECK-NEXT:    v_readlane_b32 s53, v5, 19
-; CHECK-NEXT:    v_readlane_b32 s52, v5, 18
-; CHECK-NEXT:    v_readlane_b32 s51, v5, 17
-; CHECK-NEXT:    v_readlane_b32 s50, v5, 16
-; CHECK-NEXT:    v_readlane_b32 s49, v5, 15
-; CHECK-NEXT:    v_readlane_b32 s48, v5, 14
-; CHECK-NEXT:    v_readlane_b32 s47, v5, 13
-; CHECK-NEXT:    v_readlane_b32 s46, v5, 12
-; CHECK-NEXT:    v_readlane_b32 s45, v5, 11
-; CHECK-NEXT:    v_readlane_b32 s44, v5, 10
-; CHECK-NEXT:    v_readlane_b32 s43, v5, 9
-; CHECK-NEXT:    v_readlane_b32 s42, v5, 8
-; CHECK-NEXT:    v_readlane_b32 s41, v5, 7
-; CHECK-NEXT:    v_readlane_b32 s40, v5, 6
-; CHECK-NEXT:    v_readlane_b32 s39, v5, 5
-; CHECK-NEXT:    v_readlane_b32 s38, v5, 4
-; CHECK-NEXT:    v_readlane_b32 s37, v5, 3
-; CHECK-NEXT:    v_readlane_b32 s36, v5, 2
+; CHECK-NEXT:    s_or_b64 exec, exec, s[28:29]
+; CHECK-NEXT:    v_readlane_b32 s69, v5, 37
+; CHECK-NEXT:    v_readlane_b32 s68, v5, 36
+; CHECK-NEXT:    v_readlane_b32 s67, v5, 35
+; CHECK-NEXT:    v_readlane_b32 s66, v5, 34
+; CHECK-NEXT:    v_readlane_b32 s65, v5, 33
+; CHECK-NEXT:    v_readlane_b32 s64, v5, 32
+; CHECK-NEXT:    v_readlane_b32 s63, v5, 31
+; CHECK-NEXT:    v_readlane_b32 s62, v5, 30
+; CHECK-NEXT:    v_readlane_b32 s61, v5, 29
+; CHECK-NEXT:    v_readlane_b32 s60, v5, 28
+; CHECK-NEXT:    v_readlane_b32 s59, v5, 27
+; CHECK-NEXT:    v_readlane_b32 s58, v5, 26
+; CHECK-NEXT:    v_readlane_b32 s57, v5, 25
+; CHECK-NEXT:    v_readlane_b32 s56, v5, 24
+; CHECK-NEXT:    v_readlane_b32 s55, v5, 23
+; CHECK-NEXT:    v_readlane_b32 s54, v5, 22
+; CHECK-NEXT:    v_readlane_b32 s53, v5, 21
+; CHECK-NEXT:    v_readlane_b32 s52, v5, 20
+; CHECK-NEXT:    v_readlane_b32 s51, v5, 19
+; CHECK-NEXT:    v_readlane_b32 s50, v5, 18
+; CHECK-NEXT:    v_readlane_b32 s49, v5, 17
+; CHECK-NEXT:    v_readlane_b32 s48, v5, 16
+; CHECK-NEXT:    v_readlane_b32 s47, v5, 15
+; CHECK-NEXT:    v_readlane_b32 s46, v5, 14
+; CHECK-NEXT:    v_readlane_b32 s45, v5, 13
+; CHECK-NEXT:    v_readlane_b32 s44, v5, 12
+; CHECK-NEXT:    v_readlane_b32 s43, v5, 11
+; CHECK-NEXT:    v_readlane_b32 s42, v5, 10
+; CHECK-NEXT:    v_readlane_b32 s41, v5, 9
+; CHECK-NEXT:    v_readlane_b32 s40, v5, 8
+; CHECK-NEXT:    v_readlane_b32 s39, v5, 7
+; CHECK-NEXT:    v_readlane_b32 s38, v5, 6
+; CHECK-NEXT:    v_readlane_b32 s37, v5, 5
+; CHECK-NEXT:    v_readlane_b32 s36, v5, 4
+; CHECK-NEXT:    v_readlane_b32 s35, v5, 3
+; CHECK-NEXT:    v_readlane_b32 s34, v5, 2
 ; CHECK-NEXT:    v_readlane_b32 s31, v5, 1
 ; CHECK-NEXT:    v_readlane_b32 s30, v5, 0
 ; CHECK-NEXT:    s_xor_saveexec_b64 s[4:5], -1
 ; CHECK-NEXT:    buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_mov_b64 exec, s[4:5]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir
index 215200c770245d..2646d13b4961d9 100644
--- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir
+++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir
@@ -30,35 +30,33 @@ body:             |
   ; CHECK-NEXT:   dead [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead undef [[DEF2:%[0-9]+]].sub0:vreg_64 = IMPLICIT_DEF
   ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
-  ; CHECK-NEXT:   renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr25
+  ; CHECK-NEXT:   renamable $sgpr52 = IMPLICIT_DEF implicit-def $sgpr53
   ; CHECK-NEXT:   renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
   ; CHECK-NEXT:   $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.6, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
+  ; CHECK-NEXT:   liveins: $sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5
   ; CHECK-NEXT:   S_CBRANCH_EXECNZ %bb.4, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
+  ; CHECK-NEXT:   liveins: $sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
-  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_1:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
+  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, undef renamable $sgpr52_sgpr53_sgpr54_sgpr55, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
+  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_1:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, renamable $sgpr52_sgpr53_sgpr54_sgpr55, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   SI_RETURN
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
-  ; CHECK-NEXT:   liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
+  ; CHECK-NEXT:   liveins: $sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $sgpr12 = IMPLICIT_DEF
-  ; CHECK-NEXT:   SI_SPILL_S512_SAVE renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.0, align 4, addrspace 5)
-  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead undef [[IMAGE_SAMPLE_LZ_V1_V2_2:%[0-9]+]].sub0:vreg_96 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
-  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.0, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[IMAGE_SAMPLE_LZ_V1_V2_2:%[0-9]+]].sub0:vreg_96 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
   ; CHECK-NEXT:   renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead undef [[IMAGE_SAMPLE_LZ_V1_V2_3:%[0-9]+]].sub0:vreg_128 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF2]], undef renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, killed renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
   ; CHECK-NEXT:   S_BRANCH %bb.2
@@ -66,14 +64,12 @@ body:             |
   ; CHECK-NEXT: bb.5:
   ; CHECK-NEXT:   liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
-  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_4:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
-  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_4:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], killed renamable $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27, undef renamable $sgpr52_sgpr53_sgpr54_sgpr55, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
   ; CHECK-NEXT:   S_BRANCH %bb.7
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.6:
-  ; CHECK-NEXT:   liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
+  ; CHECK-NEXT:   liveins: $sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $exec = S_XOR_B64_term $exec, undef renamable $sgpr4_sgpr5, implicit-def $scc
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.8, implicit $exec
@@ -83,7 +79,7 @@ body:             |
   ; CHECK-NEXT:   liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
-  ; CHECK-NEXT:   renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24
+  ; CHECK-NEXT:   renamable $sgpr53 = COPY undef renamable $sgpr52, implicit-def $sgpr52
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.7, implicit undef $vcc
   ; CHECK-NEXT:   S_BRANCH %bb.6
   ; CHECK-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir
index b8818c5550ad44..0bfaa9e69547fc 100644
--- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir
@@ -30,32 +30,31 @@ body:             |
   ; CHECK-NEXT:   dead undef [[DEF3:%[0-9]+]].sub1:vreg_64 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead renamable $sgpr5 = IMPLICIT_DEF
   ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
-  ; CHECK-NEXT:   renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr25
+  ; CHECK-NEXT:   renamable $sgpr52 = IMPLICIT_DEF implicit-def $sgpr53
   ; CHECK-NEXT:   renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
   ; CHECK-NEXT:   $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.5, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.4
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
-  ; CHECK-NEXT:   liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
+  ; CHECK-NEXT:   liveins: $sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5
   ; CHECK-NEXT:   S_CBRANCH_EXECNZ %bb.3, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
-  ; CHECK-NEXT:   liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
+  ; CHECK-NEXT:   liveins: $sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
-  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_1:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
+  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, undef renamable $sgpr52_sgpr53_sgpr54_sgpr55, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
+  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_1:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, renamable $sgpr52_sgpr53_sgpr54_sgpr55, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
   ; CHECK-NEXT:   SI_RETURN
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
-  ; CHECK-NEXT:   liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF
+  ; CHECK-NEXT:   liveins: $sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $sgpr12 = IMPLICIT_DEF
-  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead undef [[IMAGE_SAMPLE_LZ_V1_V2_2:%[0-9]+]].sub0:vreg_96 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
-  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
+  ; CHECK-NEXT:   renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead undef [[IMAGE_SAMPLE_LZ_V1_V2_2:%[0-9]+]].sub0:vreg_96 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
   ; CHECK-NEXT:   renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead undef [[IMAGE_SAMPLE_LZ_V1_V2_3:%[0-9]+]].sub0:vreg_128 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF3]], undef renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, killed renamable $sgpr12_sgpr13_sgpr14_sgpr15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
   ; CHECK-NEXT:   S_BRANCH %bb.2
@@ -63,14 +62,12 @@ body:             |
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51
-  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = IMPLICIT_DEF
-  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_4:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr24_sgpr25_sgpr26_sgpr27, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
-  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27 = IMPLICIT_DEF
+  ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_4:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], killed renamable $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27, undef renamable $sgpr52_sgpr53_sgpr54_sgpr55, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
   ; CHECK-NEXT:   S_BRANCH %bb.6
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   liveins: $sgpr24_sgpr25_sgpr26_sgpr27:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
+  ; CHECK-NEXT:   liveins: $sgpr52_sgpr53_sgpr54_sgpr55:0x000000000000000F, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FFFF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $exec = S_XOR_B64_term $exec, undef renamable $sgpr4_sgpr5, implicit-def $scc
   ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.7, implicit $exec
@@ -80,7 +77,7 @@ body:             |
   ; CHECK-NEXT:   liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
-  ; CHECK-NEXT:   renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24
+  ; CHECK-NEXT:   renamable $sgpr53 = COPY undef renamable $sgpr52, implicit-def $sgpr52
   ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.6, implicit undef $vcc
   ; CHECK-NEXT:   S_BRANCH %bb.5
   ; CHECK-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/issue48473.mir b/llvm/test/CodeGen/AMDGPU/issue48473.mir
index 5c202d9928ab78..8794377fd56dca 100644
--- a/llvm/test/CodeGen/AMDGPU/issue48473.mir
+++ b/llvm/test/CodeGen/AMDGPU/issue48473.mir
@@ -43,7 +43,7 @@
 # %25 to $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
 
 # CHECK-LABEL: name: issue48473
-# CHECK: S_NOP 0, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15, implicit killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, implicit killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, implicit killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, implicit killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, implicit killed renamable $sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, implicit killed renamable $sgpr92_sgpr93_sgpr94_sgpr95, implicit killed renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, implicit killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, implicit killed renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
+# CHECK: S_NOP 0, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, implicit killed renamable $sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27, implicit killed renamable $sgpr80_sgpr81_sgpr82_sgpr83, implicit killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, implicit killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, implicit killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, implicit killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, implicit killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, implicit killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, implicit killed renamable $sgpr92_sgpr93_sgpr94_sgpr95, implicit killed renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, implicit killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, implicit killed renamable $sgpr76_sgpr77_sgpr78_sgpr79, implicit killed renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
 
 ---
 name:            issue48473
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index c1ab63b8160c6a..2e2f0d297cb876 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -8744,7 +8744,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s74
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s72
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s70
-; GFX8-NEXT:    v_mov_b32_e32 v54, s68
+; GFX8-NEXT:    v_mov_b32_e32 v12, s68
 ; GFX8-NEXT:    v_mov_b32_e32 v20, s66
 ; GFX8-NEXT:    v_mov_b32_e32 v16, s64
 ; GFX8-NEXT:    v_mov_b32_e32 v24, s62
@@ -8765,7 +8765,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s46
 ; GFX8-NEXT:    s_lshr_b32 s70, s2, 21
 ; GFX8-NEXT:    s_lshr_b32 s68, s2, 18
-; GFX8-NEXT:    v_mov_b32_e32 v56, s42
+; GFX8-NEXT:    v_mov_b32_e32 v14, s42
 ; GFX8-NEXT:    s_lshr_b32 s66, s2, 19
 ; GFX8-NEXT:    s_lshr_b32 s64, s2, 16
 ; GFX8-NEXT:    v_mov_b32_e32 v22, s40
@@ -8798,8 +8798,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    v_mov_b32_e32 v11, s49
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s71
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s47
-; GFX8-NEXT:    v_mov_b32_e32 v55, s69
-; GFX8-NEXT:    v_mov_b32_e32 v57, s43
+; GFX8-NEXT:    v_mov_b32_e32 v13, s69
+; GFX8-NEXT:    v_mov_b32_e32 v15, s43
 ; GFX8-NEXT:    v_mov_b32_e32 v21, s67
 ; GFX8-NEXT:    v_mov_b32_e32 v23, s41
 ; GFX8-NEXT:    v_mov_b32_e32 v17, s65
@@ -8860,39 +8860,39 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    v_mov_b32_e32 v44, s2
 ; GFX8-NEXT:    s_add_u32 s2, s4, 0x1d0
 ; GFX8-NEXT:    s_addc_u32 s3, s5, 0
-; GFX8-NEXT:    v_mov_b32_e32 v47, s3
-; GFX8-NEXT:    v_mov_b32_e32 v46, s2
-; GFX8-NEXT:    s_add_u32 s2, s4, 0x1c0
-; GFX8-NEXT:    s_addc_u32 s3, s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v49, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v48, s2
+; GFX8-NEXT:    s_add_u32 s2, s4, 0x1c0
+; GFX8-NEXT:    s_addc_u32 s3, s5, 0
+; GFX8-NEXT:    v_mov_b32_e32 v53, s3
+; GFX8-NEXT:    v_mov_b32_e32 v52, s2
 ; GFX8-NEXT:    s_add_u32 s2, s4, 0x1b0
 ; GFX8-NEXT:    s_addc_u32 s3, s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v51, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v50, s2
 ; GFX8-NEXT:    s_add_u32 s2, s4, 0x1a0
 ; GFX8-NEXT:    s_addc_u32 s3, s5, 0
-; GFX8-NEXT:    v_mov_b32_e32 v53, s3
-; GFX8-NEXT:    v_mov_b32_e32 v52, s2
+; GFX8-NEXT:    v_mov_b32_e32 v55, s3
+; GFX8-NEXT:    v_mov_b32_e32 v54, s2
 ; GFX8-NEXT:    s_add_u32 s2, s4, 0x190
 ; GFX8-NEXT:    s_addc_u32 s3, s5, 0
-; GFX8-NEXT:    v_mov_b32_e32 v15, s3
-; GFX8-NEXT:    v_mov_b32_e32 v14, s2
+; GFX8-NEXT:    v_mov_b32_e32 v57, s3
+; GFX8-NEXT:    v_mov_b32_e32 v56, s2
 ; GFX8-NEXT:    s_add_u32 s2, s4, 0x180
 ; GFX8-NEXT:    s_addc_u32 s3, s5, 0
-; GFX8-NEXT:    v_mov_b32_e32 v13, s3
-; GFX8-NEXT:    v_mov_b32_e32 v12, s2
-; GFX8-NEXT:    buffer_store_dword v12, off, s[88:91], 0 ; 4-byte Folded Spill
-; GFX8-NEXT:    buffer_store_dword v13, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
+; GFX8-NEXT:    v_mov_b32_e32 v47, s3
+; GFX8-NEXT:    v_mov_b32_e32 v46, s2
+; GFX8-NEXT:    buffer_store_dword v46, off, s[88:91], 0 ; 4-byte Folded Spill
+; GFX8-NEXT:    buffer_store_dword v47, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
 ; GFX8-NEXT:    flat_store_dwordx4 v[42:43], v[4:7]
 ; GFX8-NEXT:    flat_store_dwordx4 v[44:45], v[8:11]
-; GFX8-NEXT:    flat_store_dwordx4 v[46:47], v[0:3]
-; GFX8-NEXT:    flat_store_dwordx4 v[48:49], v[54:57]
+; GFX8-NEXT:    flat_store_dwordx4 v[48:49], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[52:53], v[12:15]
 ; GFX8-NEXT:    flat_store_dwordx4 v[50:51], v[20:23]
-; GFX8-NEXT:    flat_store_dwordx4 v[52:53], v[16:19]
-; GFX8-NEXT:    flat_store_dwordx4 v[14:15], v[24:27]
-; GFX8-NEXT:    buffer_load_dword v18, off, s[88:91], 0 ; 4-byte Folded Reload
-; GFX8-NEXT:    buffer_load_dword v19, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
+; GFX8-NEXT:    flat_store_dwordx4 v[54:55], v[16:19]
+; GFX8-NEXT:    flat_store_dwordx4 v[56:57], v[24:27]
+; GFX8-NEXT:    buffer_load_dword v16, off, s[88:91], 0 ; 4-byte Folded Reload
+; GFX8-NEXT:    buffer_load_dword v17, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
 ; GFX8-NEXT:    s_add_u32 s2, s4, 0x170
 ; GFX8-NEXT:    s_addc_u32 s3, s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v59, s3
@@ -8903,16 +8903,16 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    v_mov_b32_e32 v60, s2
 ; GFX8-NEXT:    s_add_u32 s2, s4, 0x150
 ; GFX8-NEXT:    s_addc_u32 s3, s5, 0
-; GFX8-NEXT:    v_mov_b32_e32 v45, s3
-; GFX8-NEXT:    v_mov_b32_e32 v44, s2
+; GFX8-NEXT:    v_mov_b32_e32 v47, s3
+; GFX8-NEXT:    v_mov_b32_e32 v46, s2
 ; GFX8-NEXT:    s_add_u32 s2, s4, 0x140
 ; GFX8-NEXT:    s_addc_u32 s3, s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX8-NEXT:    s_add_u32 s0, s4, 0x130
 ; GFX8-NEXT:    v_mov_b32_e32 v7, s1
 ; GFX8-NEXT:    s_addc_u32 s1, s5, 0
-; GFX8-NEXT:    v_mov_b32_e32 v17, s1
-; GFX8-NEXT:    v_mov_b32_e32 v16, s0
+; GFX8-NEXT:    v_mov_b32_e32 v13, s1
+; GFX8-NEXT:    v_mov_b32_e32 v12, s0
 ; GFX8-NEXT:    s_add_u32 s0, s4, 0x120
 ; GFX8-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v15, s1
@@ -8920,11 +8920,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    s_add_u32 s0, s4, 0x110
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s7
-; GFX8-NEXT:    v_mov_b32_e32 v13, s3
+; GFX8-NEXT:    v_mov_b32_e32 v45, s3
 ; GFX8-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v42, vcc_lo
 ; GFX8-NEXT:    v_mov_b32_e32 v43, vcc_hi
-; GFX8-NEXT:    v_mov_b32_e32 v12, s2
+; GFX8-NEXT:    v_mov_b32_e32 v44, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX8-NEXT:    v_mov_b32_e32 v8, s12
@@ -8934,12 +8934,12 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s14
 ; GFX8-NEXT:    v_mov_b32_e32 v11, s15
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[28:31]
+; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[28:31]
 ; GFX8-NEXT:    flat_store_dwordx4 v[58:59], v[32:35]
 ; GFX8-NEXT:    flat_store_dwordx4 v[60:61], v[36:39]
-; GFX8-NEXT:    flat_store_dwordx4 v[44:45], v[40:43]
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
-; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[46:47], v[40:43]
+; GFX8-NEXT:    flat_store_dwordx4 v[44:45], v[4:7]
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
 ; GFX8-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 64f1f45bf734cf..5c748c580136d7 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -7357,6 +7357,20 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v39
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v12, off, s[12:15], 0 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(3)
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v12, off, s[12:15], 0 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(2)
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(1)
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v39
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 8f6a1f8c01ec34..1571ab62b0da05 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -3014,8 +3014,8 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:16
 ; SI-NOHSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(7)
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v31
-; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v30
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v46, 31, v31
+; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v44, 31, v30
 ; SI-NOHSA-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v39, 31, v15
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v37, 31, v14
@@ -3029,16 +3029,17 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v33, 31, v28
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v32, v28
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v34, v29
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v44, v30
-; SI-NOHSA-NEXT:    v_mov_b32_e32 v46, v31
-; SI-NOHSA-NEXT:    buffer_store_dword v44, off, s[12:15], 0 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT:    buffer_store_dword v45, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT:    buffer_store_dword v46, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT:    buffer_store_dword v47, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; SI-NOHSA-NEXT:    s_waitcnt vmcnt(9)
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v13, v44
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v15, v46
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v12, v30
+; SI-NOHSA-NEXT:    v_mov_b32_e32 v14, v31
+; SI-NOHSA-NEXT:    buffer_store_dword v12, off, s[12:15], 0 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT:    buffer_store_dword v13, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; SI-NOHSA-NEXT:    s_waitcnt vmcnt(9) expcnt(0)
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v15, 31, v7
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v13, 31, v6
-; SI-NOHSA-NEXT:    s_waitcnt expcnt(0)
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v47, 31, v5
 ; SI-NOHSA-NEXT:    v_ashrrev_i32_e32 v45, 31, v4
 ; SI-NOHSA-NEXT:    v_mov_b32_e32 v44, v4
diff --git a/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir b/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir
index d406f2932dc968..e70ed0693174b8 100644
--- a/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir
+++ b/llvm/test/CodeGen/AMDGPU/ra-inserted-scalar-instructions.mir
@@ -138,17 +138,18 @@ body:             |
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
   ; GCN-NEXT:   dead undef [[DEF1:%[0-9]+]].sub1:vreg_64 = IMPLICIT_DEF
-  ; GCN-NEXT:   SI_SPILL_S32_SAVE $sgpr1, %stack.15, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.15, addrspace 5)
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE $sgpr1, %stack.14, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.14, addrspace 5)
   ; GCN-NEXT:   undef [[COPY:%[0-9]+]].sub1:sgpr_64 = COPY $sgpr0
-  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[COPY]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
   ; GCN-NEXT:   undef [[V_READFIRSTLANE_B32_:%[0-9]+]].sub0:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF]], implicit $exec
   ; GCN-NEXT:   [[V_READFIRSTLANE_B32_:%[0-9]+]].sub1:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF]], implicit $exec
+  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[V_READFIRSTLANE_B32_]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
   ; GCN-NEXT:   undef [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub0:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF]], implicit $exec
   ; GCN-NEXT:   [[V_READFIRSTLANE_B32_1:%[0-9]+]].sub1:sgpr_64 = IMPLICIT_DEF
   ; GCN-NEXT:   SI_SPILL_S64_SAVE [[V_READFIRSTLANE_B32_1]], %stack.19, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.19, align 4, addrspace 5)
   ; GCN-NEXT:   undef [[V_READFIRSTLANE_B32_2:%[0-9]+]].sub0:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF]], implicit $exec
   ; GCN-NEXT:   [[V_READFIRSTLANE_B32_2:%[0-9]+]].sub1:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF]], implicit $exec
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 0
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_]], %stack.10, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.10, addrspace 5)
   ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 0
   ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_1]], %stack.17, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.17, addrspace 5)
   ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit undef $scc
@@ -164,110 +165,115 @@ body:             |
   ; GCN-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef [[V_READFIRSTLANE_B32_2]], 132, 0 :: ("amdgpu-noclobber" load (s128), align 8, addrspace 1)
-  ; GCN-NEXT:   SI_SPILL_S128_SAVE [[S_LOAD_DWORDX4_IMM]], %stack.14, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.14, align 4, addrspace 5)
+  ; GCN-NEXT:   SI_SPILL_S128_SAVE [[S_LOAD_DWORDX4_IMM]], %stack.20, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.20, align 4, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef [[V_READFIRSTLANE_B32_2]], 188, 0 :: ("amdgpu-noclobber" load (s256), align 8, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM]], %stack.15, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.15, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.4, implicit undef $scc
   ; GCN-NEXT:   S_BRANCH %bb.3
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.3:
   ; GCN-NEXT:   successors: %bb.4(0x80000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 -1
+  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 -1
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_2]], %stack.10, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.10, addrspace 5)
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.4:
   ; GCN-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_]], %stack.9, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.9, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY1:%[0-9]+]].sub1:sgpr_64 = COPY [[COPY]].sub1
+  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[COPY1]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM undef [[V_READFIRSTLANE_B32_2]], 120, 0 :: ("amdgpu-noclobber" load (s64), align 16, addrspace 1)
   ; GCN-NEXT:   SI_SPILL_S64_SAVE [[S_LOAD_DWORDX2_IMM]], %stack.18, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.18, align 4, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM1:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef [[V_READFIRSTLANE_B32_2]], 352, 0 :: ("amdgpu-noclobber" load (s256), align 16, addrspace 1)
-  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM1]], %stack.10, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.10, align 4, addrspace 5)
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM1]], %stack.11, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.11, align 4, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %97:sreg_64, 0, 0
-  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_LOAD_DWORD_IMM]], %stack.11, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.11, addrspace 5)
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_LOAD_DWORD_IMM]], %stack.12, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.12, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM2:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef [[V_READFIRSTLANE_B32_2]], 652, 0 :: ("amdgpu-noclobber" load (s256), align 8, addrspace 1)
-  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM2]], %stack.6, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.6, align 4, addrspace 5)
-  ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM2]], %stack.7, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.7, align 4, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_MOV_B64_]], 0, 0 :: ("amdgpu-noclobber" load (s32), align 8, addrspace 1)
   ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_LOAD_DWORD_IMM1]], %stack.3, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.3, addrspace 5)
   ; GCN-NEXT:   SI_SPILL_S64_SAVE [[V_READFIRSTLANE_B32_2]], %stack.1, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.1, align 4, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM3:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[V_READFIRSTLANE_B32_2]], 688, 0 :: ("amdgpu-noclobber" load (s256), align 16, addrspace 1)
-  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM3]], %stack.4, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.4, align 4, addrspace 5)
-  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
-  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 0
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM3]], %stack.5, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.5, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_3]], %stack.4, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.4, addrspace 5)
+  ; GCN-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 0
   ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.6, implicit undef $scc
   ; GCN-NEXT:   S_BRANCH %bb.5
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.5:
   ; GCN-NEXT:   successors: %bb.6(0x80000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 -1
+  ; GCN-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 -1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.6:
   ; GCN-NEXT:   successors: %bb.7(0x40000000), %bb.10(0x40000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_3]], %stack.5, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.5, addrspace 5)
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_4]], %stack.6, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.6, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sgpr_32 = S_LOAD_DWORD_IMM undef %123:sgpr_64, 0, 0 :: ("amdgpu-noclobber" load (s32), align 16, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_LOAD_DWORD_IMM2]], %stack.22, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.22, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM4:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %124:sgpr_64, 152, 0 :: ("amdgpu-noclobber" load (s256), align 4, addrspace 1)
-  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM4]], %stack.20, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.20, align 4, addrspace 5)
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM4]], %stack.21, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.21, align 4, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM5:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %125:sgpr_64, 220, 0 :: ("amdgpu-noclobber" load (s256), align 4, addrspace 1)
   ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM5]], %stack.16, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.16, align 4, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM6:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %126:sgpr_64, 384, 0 :: ("amdgpu-noclobber" load (s256), align 4, addrspace 1)
   ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM6]], %stack.13, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.13, align 4, addrspace 5)
   ; GCN-NEXT:   [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM undef %127:sgpr_64, 440, 0 :: ("amdgpu-noclobber" load (s512), align 8, addrspace 1)
   ; GCN-NEXT:   [[S_LOAD_DWORDX16_IMM1:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM undef %128:sgpr_64, 584, 0 :: ("amdgpu-noclobber" load (s512), align 16, addrspace 1)
-  ; GCN-NEXT:   SI_SPILL_S512_SAVE [[S_LOAD_DWORDX16_IMM1]], %stack.12, implicit $exec, implicit $sgpr32 :: (store (s512) into %stack.12, align 4, addrspace 5)
-  ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM7:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[V_READFIRSTLANE_B32_]], 156, 0 :: ("amdgpu-noclobber" load (s256), align 8, addrspace 1)
-  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM7]], %stack.8, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.8, align 4, addrspace 5)
-  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.19, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.19, align 4, addrspace 5)
-  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM3:%[0-9]+]]:sgpr_32 = S_LOAD_DWORD_IMM [[SI_SPILL_S64_RESTORE]], 0, 0 :: ("amdgpu-noclobber" load (s32), align 8, addrspace 1)
-  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_LOAD_DWORD_IMM3]], %stack.7, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.7, addrspace 5)
-  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[V_READFIRSTLANE_B32_]], %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
-  ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sgpr_64 = COPY [[V_READFIRSTLANE_B32_]]
-  ; GCN-NEXT:   dead [[S_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY1]], 0, 0 :: ("amdgpu-noclobber" load (s32), addrspace 1)
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORDX8_IMM7:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[SI_SPILL_S64_RESTORE]], 156, 0 :: ("amdgpu-noclobber" load (s256), align 8, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S256_SAVE [[S_LOAD_DWORDX8_IMM7]], %stack.9, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.9, align 4, addrspace 5)
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE1:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.19, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.19, align 4, addrspace 5)
+  ; GCN-NEXT:   [[S_LOAD_DWORD_IMM3:%[0-9]+]]:sgpr_32 = S_LOAD_DWORD_IMM [[SI_SPILL_S64_RESTORE1]], 0, 0 :: ("amdgpu-noclobber" load (s32), align 8, addrspace 1)
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_LOAD_DWORD_IMM3]], %stack.8, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.8, addrspace 5)
+  ; GCN-NEXT:   dead [[S_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[SI_SPILL_S64_RESTORE]], 0, 0 :: ("amdgpu-noclobber" load (s32), addrspace 1)
   ; GCN-NEXT:   [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0
   ; GCN-NEXT:   [[S_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_MOV_B64_1]], 0, 0 :: ("amdgpu-noclobber" load (s32), addrspace 1)
-  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE1:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
-  ; GCN-NEXT:   undef [[COPY2:%[0-9]+]].sub1:sgpr_64 = COPY [[SI_SPILL_S64_RESTORE1]].sub1
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE2:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY2:%[0-9]+]].sub1:sgpr_64 = COPY [[SI_SPILL_S64_RESTORE2]].sub1
   ; GCN-NEXT:   [[COPY2:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 1
+  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[COPY2]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
   ; GCN-NEXT:   S_CBRANCH_SCC1 %bb.10, implicit undef $scc
   ; GCN-NEXT:   S_BRANCH %bb.7
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.7:
   ; GCN-NEXT:   successors: %bb.8(0x40000000), %bb.9(0x40000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   SI_SPILL_S64_SAVE [[COPY2]], %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
   ; GCN-NEXT:   undef [[V_READFIRSTLANE_B32_3:%[0-9]+]].sub0:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF1]].sub0, implicit $exec
   ; GCN-NEXT:   dead [[V_READFIRSTLANE_B32_3:%[0-9]+]].sub1:sgpr_64 = V_READFIRSTLANE_B32 undef [[DEF1]].sub1, implicit $exec
   ; GCN-NEXT:   [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; GCN-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; GCN-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sgpr_32 = S_MOV_B32 0
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_5]], %stack.4, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.4, addrspace 5)
   ; GCN-NEXT:   $vcc = COPY [[DEF3]]
-  ; GCN-NEXT:   S_CBRANCH_VCCNZ %bb.9, implicit $vcc
+  ; GCN-NEXT:   S_CBRANCH_VCCNZ %bb.9, implicit $vcc_lo
   ; GCN-NEXT:   S_BRANCH %bb.8
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.8:
   ; GCN-NEXT:   successors: %bb.9(0x80000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sgpr_32 = S_MOV_B32 -1
+  ; GCN-NEXT:   [[S_MOV_B32_6:%[0-9]+]]:sgpr_32 = S_MOV_B32 -1
+  ; GCN-NEXT:   SI_SPILL_S32_SAVE [[S_MOV_B32_6]], %stack.4, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.4, addrspace 5)
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.9:
   ; GCN-NEXT:   successors: %bb.10(0x80000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sgpr_32 = COPY [[S_MOV_B32_4]]
-  ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
+  ; GCN-NEXT:   dead [[SI_SPILL_S64_RESTORE3:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.10:
   ; GCN-NEXT:   successors: %bb.11(0x40000000), %bb.12(0x40000000)
   ; GCN-NEXT: {{  $}}
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORD_IMM2]], 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.17, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.17, addrspace 5)
-  ; GCN-NEXT:   dead [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[V_CMP_GT_F32_e64_]], [[SI_SPILL_S32_RESTORE]], implicit-def dead $scc
-  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE1:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.15, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.15, addrspace 5)
-  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE1]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.22, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.22, addrspace 5)
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE]], 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE1:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.17, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.17, addrspace 5)
+  ; GCN-NEXT:   dead [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[V_CMP_GT_F32_e64_]], [[SI_SPILL_S32_RESTORE1]], implicit-def dead $scc
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE2:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.14, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.14, addrspace 5)
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE2]], 0, implicit-def $scc
   ; GCN-NEXT:   dead [[DEF4:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE2:%[0-9]+]]:sreg_64_xexec = SI_SPILL_S64_RESTORE %stack.18, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.18, align 4, addrspace 5)
-  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S64_RESTORE2]].sub1, 0, implicit-def $scc
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE4:%[0-9]+]]:sreg_64_xexec = SI_SPILL_S64_RESTORE %stack.18, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.18, align 4, addrspace 5)
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S64_RESTORE4]].sub1, 0, implicit-def $scc
   ; GCN-NEXT:   dead [[DEF5:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.20, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.20, align 4, addrspace 5)
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.21, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.21, align 4, addrspace 5)
   ; GCN-NEXT:   undef [[COPY3:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE]].sub0 {
   ; GCN-NEXT:     internal [[COPY3]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE]].sub2
   ; GCN-NEXT:     internal [[COPY3]].sub4:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE]].sub4
@@ -277,18 +283,20 @@ body:             |
   ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY3]].sub0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_2:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY3]].sub2, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_3:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY3]].sub4, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LOAD_DWORDX8_IMM]].sub0, undef [[S_OR_B32_]], implicit-def dead $scc
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_4:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub1, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_5:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub2, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_6:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub3, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_7:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub4, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_8:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub5, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_9:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX8_IMM]].sub6, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.14, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.14, align 4, addrspace 5)
-  ; GCN-NEXT:   undef [[COPY4:%[0-9]+]].sub0_sub1_sub2:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub0_sub1_sub2
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_10:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_11:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub1, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_12:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE1:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.15, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.15, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY4:%[0-9]+]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16
+  ; GCN-NEXT:   dead [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY4]].sub0, undef [[S_OR_B32_]], implicit-def dead $scc
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_4:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub1, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_5:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_6:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub3, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_7:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub4, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_8:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub5, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_9:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY4]].sub6, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S128_RESTORE:%[0-9]+]]:sgpr_128 = SI_SPILL_S128_RESTORE %stack.20, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.20, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY5:%[0-9]+]].sub0_sub1_sub2:sgpr_128 = COPY [[SI_SPILL_S128_RESTORE]].sub0_sub1_sub2
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_10:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY5]].sub0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_11:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY5]].sub1, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_12:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY5]].sub2, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DEF6:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GCN-NEXT:   dead [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF5]], [[DEF6]], implicit-def dead $scc
   ; GCN-NEXT:   dead [[DEF7:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
@@ -297,121 +305,105 @@ body:             |
   ; GCN-NEXT:   dead [[DEF10:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GCN-NEXT:   dead [[DEF11:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GCN-NEXT:   dead [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF11]], undef [[DEF11]], implicit-def dead $scc
-  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE1:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.16, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.16, align 4, addrspace 5)
-  ; GCN-NEXT:   undef [[COPY5:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].sub0 {
-  ; GCN-NEXT:     internal [[COPY5]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].sub2
-  ; GCN-NEXT:     internal [[COPY5]].sub5:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].sub5
-  ; GCN-NEXT:     internal [[COPY5]].sub7:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE1]].sub7
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE2:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.16, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.16, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY6:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE2]].sub0 {
+  ; GCN-NEXT:     internal [[COPY6]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE2]].sub2
+  ; GCN-NEXT:     internal [[COPY6]].sub5:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE2]].sub5
+  ; GCN-NEXT:     internal [[COPY6]].sub7:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE2]].sub7
   ; GCN-NEXT:   }
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_13:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY5]].sub0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_13:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   dead [[S_AND_B32_3:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[V_CMP_GT_F32_e64_8]], undef [[V_CMP_GT_F32_e64_9]], implicit-def dead $scc
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_14:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY5]].sub2, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY5]].sub5, [[COPY5]].sub7, implicit-def dead $scc
-  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE2:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.10, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.10, align 4, addrspace 5)
-  ; GCN-NEXT:   undef [[COPY6:%[0-9]+]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE2]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16
-  ; GCN-NEXT:   dead [[S_OR_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY6]].sub0, [[COPY6]].sub1, implicit-def dead $scc
-  ; GCN-NEXT:   dead [[S_OR_B32_4:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY6]].sub2, undef [[S_OR_B32_3]], implicit-def dead $scc
-  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE2:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.9, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.9, addrspace 5)
-  ; GCN-NEXT:   dead [[S_AND_B32_4:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_OR_B32_3]], [[SI_SPILL_S32_RESTORE2]], implicit-def dead $scc
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_15:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub3, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_16:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub4, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_17:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub5, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_18:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub6, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE3:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.11, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.11, addrspace 5)
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_19:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE3]], 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE3:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.13, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.13, align 4, addrspace 5)
-  ; GCN-NEXT:   undef [[COPY7:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub0 {
-  ; GCN-NEXT:     internal [[COPY7]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub2
-  ; GCN-NEXT:     internal [[COPY7]].sub4:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub4
-  ; GCN-NEXT:     internal [[COPY7]].sub7:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].sub7
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_14:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY6]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[S_OR_B32_2:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY6]].sub5, [[COPY6]].sub7, implicit-def dead $scc
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE3:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.11, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.11, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY7:%[0-9]+]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE3]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16
+  ; GCN-NEXT:   dead [[S_OR_B32_3:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY7]].sub0, [[COPY7]].sub1, implicit-def dead $scc
+  ; GCN-NEXT:   dead [[S_OR_B32_4:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY7]].sub2, undef [[S_OR_B32_3]], implicit-def dead $scc
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE3:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.10, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.10, addrspace 5)
+  ; GCN-NEXT:   dead [[S_AND_B32_4:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_OR_B32_3]], [[SI_SPILL_S32_RESTORE3]], implicit-def dead $scc
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_15:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub3, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_16:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub4, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_17:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub5, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_18:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub6, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE4:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.12, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.12, addrspace 5)
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_19:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE4]], 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE4:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.13, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.13, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY8:%[0-9]+]].sub0:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE4]].sub0 {
+  ; GCN-NEXT:     internal [[COPY8]].sub2:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE4]].sub2
+  ; GCN-NEXT:     internal [[COPY8]].sub4:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE4]].sub4
+  ; GCN-NEXT:     internal [[COPY8]].sub7:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE4]].sub7
   ; GCN-NEXT:   }
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_20:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_21:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_20:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_21:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub2, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   dead [[DEF12:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_22:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY7]].sub4, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_22:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub4, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   dead [[S_AND_B32_5:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF12]], undef [[V_CMP_GT_F32_e64_20]], implicit-def dead $scc
-  ; GCN-NEXT:   S_CMP_EQ_U32 [[COPY7]].sub7, 0, implicit-def $scc
-  ; GCN-NEXT:   undef [[COPY8:%[0-9]+]].sub0:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub0 {
-  ; GCN-NEXT:     internal [[COPY8]].sub2:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub2
-  ; GCN-NEXT:     internal [[COPY8]].sub4:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub4
-  ; GCN-NEXT:     internal [[COPY8]].sub6:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub6
-  ; GCN-NEXT:     internal [[COPY8]].sub9:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub9
-  ; GCN-NEXT:     internal [[COPY8]].sub10:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub10
-  ; GCN-NEXT:     internal [[COPY8]].sub13:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub13
-  ; GCN-NEXT:     internal [[COPY8]].sub14:sgpr_512 = COPY [[S_LOAD_DWORDX16_IMM]].sub14
-  ; GCN-NEXT:   }
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_23:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_24:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub2, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_25:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub4, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_26:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub6, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[COPY8]].sub7, 0, implicit-def $scc
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_23:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX16_IMM]].sub0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_24:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX16_IMM]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_25:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX16_IMM]].sub4, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_26:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX16_IMM]].sub6, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   dead [[S_AND_B32_6:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[V_CMP_GT_F32_e64_23]], undef [[V_CMP_GT_F32_e64_23]], implicit-def dead $scc
-  ; GCN-NEXT:   dead [[S_OR_B32_5:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY8]].sub10, [[COPY8]].sub9, implicit-def dead $scc
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_27:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub13, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_28:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY8]].sub14, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   [[SI_SPILL_S512_RESTORE:%[0-9]+]]:sgpr_512 = SI_SPILL_S512_RESTORE %stack.12, implicit $exec, implicit $sgpr32 :: (load (s512) from %stack.12, align 4, addrspace 5)
-  ; GCN-NEXT:   undef [[COPY9:%[0-9]+]].sub1:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub1 {
-  ; GCN-NEXT:     internal [[COPY9]].sub5:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub5
-  ; GCN-NEXT:     internal [[COPY9]].sub6:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub6
-  ; GCN-NEXT:     internal [[COPY9]].sub9:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub9
-  ; GCN-NEXT:     internal [[COPY9]].sub10:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub10
-  ; GCN-NEXT:     internal [[COPY9]].sub12:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub12
-  ; GCN-NEXT:     internal [[COPY9]].sub15:sgpr_512 = COPY [[SI_SPILL_S512_RESTORE]].sub15
-  ; GCN-NEXT:   }
-  ; GCN-NEXT:   S_CMP_EQ_U32 [[COPY9]].sub1, 0, implicit-def $scc
+  ; GCN-NEXT:   dead [[S_OR_B32_5:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_LOAD_DWORDX16_IMM]].sub10, [[S_LOAD_DWORDX16_IMM]].sub9, implicit-def dead $scc
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_27:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX16_IMM]].sub13, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_28:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX16_IMM]].sub14, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[S_LOAD_DWORDX16_IMM1]].sub1, 0, implicit-def $scc
   ; GCN-NEXT:   dead [[DEF13:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_29:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub5, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_30:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub6, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_29:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX16_IMM1]].sub5, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_30:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX16_IMM1]].sub6, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   dead [[DEF14:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_31:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub9, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_32:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub10, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_31:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX16_IMM1]].sub9, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_32:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX16_IMM1]].sub10, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   dead [[DEF15:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GCN-NEXT:   dead [[S_AND_B32_7:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF15]], undef [[DEF14]], implicit-def dead $scc
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_33:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub12, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE4:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.6, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.6, align 4, addrspace 5)
-  ; GCN-NEXT:   undef [[COPY10:%[0-9]+]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE4]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16
-  ; GCN-NEXT:   dead [[S_OR_B32_6:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY10]].sub0, [[COPY9]].sub15, implicit-def dead $scc
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_33:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[S_LOAD_DWORDX16_IMM1]].sub12, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE5:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.7, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.7, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY9:%[0-9]+]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE5]].lo16_hi16_sub1_lo16_sub1_hi16_sub2_lo16_sub2_hi16_sub3_lo16_sub3_hi16_sub4_lo16_sub4_hi16_sub5_lo16_sub5_hi16_sub6_lo16_sub6_hi16
+  ; GCN-NEXT:   dead [[S_OR_B32_6:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY9]].sub0, [[S_LOAD_DWORDX16_IMM1]].sub15, implicit-def dead $scc
   ; GCN-NEXT:   dead [[DEF16:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_34:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub1, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_35:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_34:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub1, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_35:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub2, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   dead [[DEF17:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_36:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub3, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_37:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub4, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_36:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub3, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_37:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub4, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   dead [[DEF18:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_38:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub5, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_39:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub6, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_38:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub5, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_39:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY9]].sub6, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   dead [[S_AND_B32_8:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[DEF18]], undef [[DEF17]], implicit-def dead $scc
-  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE5:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.4, align 4, addrspace 5)
-  ; GCN-NEXT:   undef [[COPY11:%[0-9]+]].sub0_sub1_sub2_sub3_sub4_sub5:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE5]].sub0_sub1_sub2_sub3_sub4_sub5
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_40:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_41:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub1, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE4:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5)
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_42:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE4]], 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_43:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub2, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_44:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY11]].sub3, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   dead [[S_OR_B32_7:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY11]].sub4, [[COPY11]].sub5, implicit-def dead $scc
-  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE4]], 0, implicit-def $scc
-  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE5:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.5, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.5, addrspace 5)
-  ; GCN-NEXT:   dead [[S_AND_B32_9:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_OR_B32_7]], [[SI_SPILL_S32_RESTORE5]], implicit-def dead $scc
-  ; GCN-NEXT:   dead [[S_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY2]], 0, 0 :: ("amdgpu-noclobber" load (s32), align 8, addrspace 1)
-  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE6:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.8, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.8, align 4, addrspace 5)
-  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S256_RESTORE6]].sub7, 0, implicit-def $scc
-  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE6:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.7, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.7, addrspace 5)
-  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_45:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE6]], 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE6:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.5, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.5, align 4, addrspace 5)
+  ; GCN-NEXT:   undef [[COPY10:%[0-9]+]].sub0_sub1_sub2_sub3_sub4_sub5:sgpr_256 = COPY [[SI_SPILL_S256_RESTORE6]].sub0_sub1_sub2_sub3_sub4_sub5
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_40:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub0, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_41:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub1, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE5:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.3, addrspace 5)
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_42:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE5]], 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_43:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub2, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_44:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[COPY10]].sub3, 0, implicit $mode, implicit $exec
+  ; GCN-NEXT:   dead [[S_OR_B32_7:%[0-9]+]]:sreg_32 = S_OR_B32 [[COPY10]].sub4, [[COPY10]].sub5, implicit-def dead $scc
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S32_RESTORE5]], 0, implicit-def $scc
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE6:%[0-9]+]]:sreg_32_xm0_xexec = SI_SPILL_S32_RESTORE %stack.6, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.6, addrspace 5)
+  ; GCN-NEXT:   dead [[S_AND_B32_9:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_OR_B32_7]], [[SI_SPILL_S32_RESTORE6]], implicit-def dead $scc
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE5:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
+  ; GCN-NEXT:   dead [[S_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[SI_SPILL_S64_RESTORE5]], 0, 0 :: ("amdgpu-noclobber" load (s32), align 8, addrspace 1)
+  ; GCN-NEXT:   [[SI_SPILL_S256_RESTORE7:%[0-9]+]]:sgpr_256 = SI_SPILL_S256_RESTORE %stack.9, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.9, align 4, addrspace 5)
+  ; GCN-NEXT:   S_CMP_EQ_U32 [[SI_SPILL_S256_RESTORE7]].sub7, 0, implicit-def $scc
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE7:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.8, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.8, addrspace 5)
+  ; GCN-NEXT:   dead [[V_CMP_GT_F32_e64_45:%[0-9]+]]:sreg_32 = V_CMP_GT_F32_e64 0, 0, 0, [[SI_SPILL_S32_RESTORE7]], 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   [[DEF19:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GCN-NEXT:   dead [[S_AND_B32_10:%[0-9]+]]:sreg_32 = S_AND_B32 [[DEF19]], undef [[S_LOAD_DWORD_IMM6]], implicit-def dead $scc
-  ; GCN-NEXT:   dead [[S_AND_B32_11:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_AND_B32_10]], [[S_MOV_B32_2]], implicit-def dead $scc
+  ; GCN-NEXT:   [[SI_SPILL_S32_RESTORE8:%[0-9]+]]:sgpr_32 = SI_SPILL_S32_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.4, addrspace 5)
+  ; GCN-NEXT:   dead [[S_AND_B32_11:%[0-9]+]]:sreg_32 = S_AND_B32 undef [[S_AND_B32_10]], [[SI_SPILL_S32_RESTORE8]], implicit-def dead $scc
   ; GCN-NEXT:   $vcc = COPY undef [[S_AND_B32_11]]
-  ; GCN-NEXT:   S_CBRANCH_VCCNZ %bb.12, implicit $vcc
+  ; GCN-NEXT:   S_CBRANCH_VCCNZ %bb.12, implicit $vcc_lo
   ; GCN-NEXT:   S_BRANCH %bb.11
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.11:
   ; GCN-NEXT:   successors: %bb.12(0x80000000)
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.12:
-  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE3:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
-  ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], [[SI_SPILL_S64_RESTORE3]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
-  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE4:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
-  ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], [[SI_SPILL_S64_RESTORE4]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE6:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.1, align 4, addrspace 5)
+  ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], [[SI_SPILL_S64_RESTORE6]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
+  ; GCN-NEXT:   [[SI_SPILL_S64_RESTORE7:%[0-9]+]]:sgpr_64 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.0, align 4, addrspace 5)
+  ; GCN-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], [[SI_SPILL_S64_RESTORE7]], 0, 0, implicit $exec :: (store (s32), addrspace 1)
   ; GCN-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1, %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-fail-unsatisfiable-overlapping-tuple-hints.mir b/llvm/test/CodeGen/AMDGPU/regalloc-fail-unsatisfiable-overlapping-tuple-hints.mir
index 09be927dc952e2..ffbfbf82e4d055 100644
--- a/llvm/test/CodeGen/AMDGPU/regalloc-fail-unsatisfiable-overlapping-tuple-hints.mir
+++ b/llvm/test/CodeGen/AMDGPU/regalloc-fail-unsatisfiable-overlapping-tuple-hints.mir
@@ -42,27 +42,27 @@ body:             |
   ; CHECK: bb.0:
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_NOP 0, implicit-def %7, implicit-def %19, implicit-def %5
-  ; CHECK-NEXT:   SI_SPILL_V256_SAVE %19, %stack.3, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.3, align 4, addrspace 5)
+  ; CHECK-NEXT:   S_NOP 0, implicit-def %7, implicit-def %15, implicit-def %5
+  ; CHECK-NEXT:   SI_SPILL_V256_SAVE %15, %stack.2, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   SI_SPILL_V256_SAVE %7, %stack.1, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.1, align 4, addrspace 5)
   ; CHECK-NEXT:   SI_SPILL_V256_SAVE %5, %stack.0, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   S_NOP 0, implicit-def %17
-  ; CHECK-NEXT:   SI_SPILL_V256_SAVE %17, %stack.2, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   SI_SPILL_V256_SAVE %17, %stack.3, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.3, align 4, addrspace 5)
   ; CHECK-NEXT:   S_NOP 0, implicit-def %4
   ; CHECK-NEXT:   [[SI_SPILL_V256_RESTORE:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.1, align 4, addrspace 5)
-  ; CHECK-NEXT:   [[SI_SPILL_V256_RESTORE1:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.3, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[SI_SPILL_V256_RESTORE1:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V256_RESTORE]], implicit [[SI_SPILL_V256_RESTORE1]], implicit %4
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_256 = COPY [[SI_SPILL_V256_RESTORE1]]
+  ; CHECK-NEXT:   [[SI_SPILL_V256_RESTORE2:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.2, align 4, addrspace 5)
   ; CHECK-NEXT:   S_CBRANCH_EXECNZ %bb.2, implicit $exec
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.2(0x80000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_NOP 0, implicit [[COPY]]
-  ; CHECK-NEXT:   [[SI_SPILL_V256_RESTORE2:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V256_RESTORE2]]
-  ; CHECK-NEXT:   [[SI_SPILL_V256_RESTORE3:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.2, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[SI_SPILL_V256_RESTORE3:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5)
   ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V256_RESTORE3]]
+  ; CHECK-NEXT:   [[SI_SPILL_V256_RESTORE4:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.3, align 4, addrspace 5)
+  ; CHECK-NEXT:   S_NOP 0, implicit [[SI_SPILL_V256_RESTORE4]]
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
index 95eac12a65389e..95ecc4541d0769 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
+++ b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir
@@ -264,14 +264,14 @@ body:             |
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9
     ; GCN-NEXT: renamable $sgpr0 = S_GET_WAVEID_IN_WORKGROUP
-    ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
     ; GCN-NEXT: renamable $sgpr1 = S_GET_WAVEID_IN_WORKGROUP
     ; GCN-NEXT: renamable $sgpr0 = S_GET_WAVEID_IN_WORKGROUP
-    ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
-    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
+    ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
-    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
     ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3
     %0:sreg_64_xexec = COPY $sgpr8_sgpr9
@@ -295,13 +295,21 @@ body:             |
     ; GCN: liveins: $sgpr10, $sgpr8_sgpr9
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9
-    ; GCN-NEXT: renamable $sgpr0 = COPY $sgpr10
-    ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
-    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
-    ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
-    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
-    ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
+    ; GCN-NEXT: SI_SPILL_S32_SAVE $sgpr10, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, killed renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, killed renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
+    ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, killed renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
+    ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.2, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.2, addrspace 5)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
+    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.2, addrspace 5)
+    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
     ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3, implicit killed renamable $sgpr0
     %0:sreg_64_xexec = COPY $sgpr8_sgpr9
     %1:sgpr_32 = COPY $sgpr10
@@ -324,13 +332,21 @@ body:             |
     ; GCN: liveins: $sgpr10, $sgpr8_sgpr9
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9
-    ; GCN-NEXT: renamable $sgpr0 = COPY $sgpr10
-    ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
-    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
-    ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4)
-    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
-    ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
+    ; GCN-NEXT: SI_SPILL_S32_SAVE $sgpr10, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, killed renamable $sgpr0, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4)
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, killed renamable $sgpr0, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4)
+    ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
+    ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, killed renamable $sgpr0, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4)
+    ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.2, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.2, addrspace 5)
     ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5)
+    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.2, addrspace 5)
+    ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0
+    ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5)
     ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3, implicit killed renamable $sgpr0
     %0:sreg_64_xexec = COPY $sgpr8_sgpr9
     %1:sgpr_32 = COPY $sgpr10
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 0ad10437299f48..d954181b062129 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -61,10 +61,10 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    v_writelane_b32 v22, s18, 28
 ; CHECK-NEXT:    v_writelane_b32 v22, s19, 29
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[42:43]
+; CHECK-NEXT:    ; def s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[52:55]
+; CHECK-NEXT:    ; def s[36:39]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:11]
@@ -83,218 +83,220 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    ; def s[16:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[40:41]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[36:39]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[44:51]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:15]
+; CHECK-NEXT:    ; def s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_writelane_b32 v22, s0, 38
 ; CHECK-NEXT:    v_writelane_b32 v22, s1, 39
-; CHECK-NEXT:    v_writelane_b32 v22, s2, 40
-; CHECK-NEXT:    v_writelane_b32 v22, s3, 41
-; CHECK-NEXT:    v_writelane_b32 v22, s4, 42
-; CHECK-NEXT:    v_writelane_b32 v22, s5, 43
-; CHECK-NEXT:    v_writelane_b32 v22, s6, 44
-; CHECK-NEXT:    v_writelane_b32 v22, s7, 45
-; CHECK-NEXT:    v_writelane_b32 v22, s8, 46
-; CHECK-NEXT:    v_writelane_b32 v22, s9, 47
-; CHECK-NEXT:    v_writelane_b32 v22, s10, 48
-; CHECK-NEXT:    v_writelane_b32 v22, s11, 49
-; CHECK-NEXT:    v_writelane_b32 v22, s12, 50
-; CHECK-NEXT:    v_writelane_b32 v22, s13, 51
-; CHECK-NEXT:    v_writelane_b32 v22, s14, 52
-; CHECK-NEXT:    v_writelane_b32 v22, s15, 53
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[34:35]
-; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v22, s0, 54
-; CHECK-NEXT:    v_writelane_b32 v22, s1, 55
-; CHECK-NEXT:    v_writelane_b32 v22, s2, 56
-; CHECK-NEXT:    v_writelane_b32 v22, s3, 57
+; CHECK-NEXT:    v_writelane_b32 v22, s0, 40
+; CHECK-NEXT:    v_writelane_b32 v22, s1, 41
+; CHECK-NEXT:    v_writelane_b32 v22, s2, 42
+; CHECK-NEXT:    v_writelane_b32 v22, s3, 43
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v22, s0, 58
-; CHECK-NEXT:    v_writelane_b32 v22, s1, 59
-; CHECK-NEXT:    v_writelane_b32 v22, s2, 60
-; CHECK-NEXT:    ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
-; CHECK-NEXT:    v_writelane_b32 v22, s3, 61
-; CHECK-NEXT:    v_writelane_b32 v22, s4, 62
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 0
-; CHECK-NEXT:    v_writelane_b32 v22, s5, 63
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 1
+; CHECK-NEXT:    v_writelane_b32 v22, s0, 44
+; CHECK-NEXT:    v_writelane_b32 v22, s1, 45
+; CHECK-NEXT:    v_writelane_b32 v22, s2, 46
+; CHECK-NEXT:    v_writelane_b32 v22, s3, 47
+; CHECK-NEXT:    v_writelane_b32 v22, s4, 48
+; CHECK-NEXT:    v_writelane_b32 v22, s5, 49
+; CHECK-NEXT:    v_writelane_b32 v22, s6, 50
+; CHECK-NEXT:    v_writelane_b32 v22, s7, 51
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 2
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 3
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 4
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 5
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 6
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 7
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 8
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 9
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 10
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 11
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 12
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 13
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 14
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 15
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 16
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 17
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:1]
+; CHECK-NEXT:    ; def s[40:41]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 18
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 19
+; CHECK-NEXT:    v_writelane_b32 v22, s40, 52
+; CHECK-NEXT:    v_writelane_b32 v22, s41, 53
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:3]
+; CHECK-NEXT:    ; def s[40:43]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 20
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 21
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 22
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 23
+; CHECK-NEXT:    v_writelane_b32 v22, s40, 54
+; CHECK-NEXT:    v_writelane_b32 v22, s41, 55
+; CHECK-NEXT:    v_writelane_b32 v22, s42, 56
+; CHECK-NEXT:    v_writelane_b32 v22, s43, 57
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:7]
+; CHECK-NEXT:    ; def s[40:47]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 24
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 25
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 26
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 27
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 28
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 29
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 30
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 31
+; CHECK-NEXT:    v_writelane_b32 v22, s40, 58
+; CHECK-NEXT:    v_writelane_b32 v22, s41, 59
+; CHECK-NEXT:    v_writelane_b32 v22, s42, 60
+; CHECK-NEXT:    ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
+; CHECK-NEXT:    v_writelane_b32 v22, s43, 61
+; CHECK-NEXT:    v_writelane_b32 v22, s44, 62
+; CHECK-NEXT:    v_writelane_b32 v23, s46, 0
+; CHECK-NEXT:    v_writelane_b32 v22, s45, 63
+; CHECK-NEXT:    v_writelane_b32 v23, s47, 1
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[40:55]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v23, s40, 2
+; CHECK-NEXT:    v_writelane_b32 v23, s41, 3
+; CHECK-NEXT:    v_writelane_b32 v23, s42, 4
+; CHECK-NEXT:    v_writelane_b32 v23, s43, 5
+; CHECK-NEXT:    v_writelane_b32 v23, s44, 6
+; CHECK-NEXT:    v_writelane_b32 v23, s45, 7
+; CHECK-NEXT:    v_writelane_b32 v23, s46, 8
+; CHECK-NEXT:    v_writelane_b32 v23, s47, 9
+; CHECK-NEXT:    v_writelane_b32 v23, s48, 10
+; CHECK-NEXT:    v_writelane_b32 v23, s49, 11
+; CHECK-NEXT:    v_writelane_b32 v23, s50, 12
+; CHECK-NEXT:    v_writelane_b32 v23, s51, 13
+; CHECK-NEXT:    v_writelane_b32 v23, s52, 14
+; CHECK-NEXT:    v_writelane_b32 v23, s53, 15
+; CHECK-NEXT:    v_writelane_b32 v23, s54, 16
+; CHECK-NEXT:    v_writelane_b32 v23, s55, 17
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:15]
+; CHECK-NEXT:    ; def s[40:41]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 32
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 33
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 34
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 35
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 36
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 37
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 38
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 39
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 40
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 41
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 42
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 43
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 44
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 45
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 46
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 47
+; CHECK-NEXT:    v_writelane_b32 v23, s40, 18
+; CHECK-NEXT:    v_writelane_b32 v23, s41, 19
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[40:43]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v23, s40, 20
+; CHECK-NEXT:    v_writelane_b32 v23, s41, 21
+; CHECK-NEXT:    v_writelane_b32 v23, s42, 22
+; CHECK-NEXT:    v_writelane_b32 v23, s43, 23
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[40:47]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v23, s40, 24
+; CHECK-NEXT:    v_writelane_b32 v23, s41, 25
+; CHECK-NEXT:    v_writelane_b32 v23, s42, 26
+; CHECK-NEXT:    v_writelane_b32 v23, s43, 27
+; CHECK-NEXT:    v_writelane_b32 v23, s44, 28
+; CHECK-NEXT:    v_writelane_b32 v23, s45, 29
+; CHECK-NEXT:    v_writelane_b32 v23, s46, 30
+; CHECK-NEXT:    v_writelane_b32 v23, s47, 31
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[40:55]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v23, s40, 32
+; CHECK-NEXT:    v_writelane_b32 v23, s41, 33
+; CHECK-NEXT:    v_writelane_b32 v23, s42, 34
+; CHECK-NEXT:    v_writelane_b32 v23, s43, 35
+; CHECK-NEXT:    v_writelane_b32 v23, s44, 36
+; CHECK-NEXT:    v_writelane_b32 v23, s45, 37
+; CHECK-NEXT:    v_writelane_b32 v23, s46, 38
+; CHECK-NEXT:    v_writelane_b32 v23, s47, 39
+; CHECK-NEXT:    v_writelane_b32 v23, s48, 40
+; CHECK-NEXT:    v_writelane_b32 v23, s49, 41
+; CHECK-NEXT:    v_writelane_b32 v23, s50, 42
+; CHECK-NEXT:    v_writelane_b32 v23, s51, 43
+; CHECK-NEXT:    v_writelane_b32 v23, s52, 44
+; CHECK-NEXT:    v_writelane_b32 v23, s53, 45
+; CHECK-NEXT:    v_writelane_b32 v23, s54, 46
+; CHECK-NEXT:    v_writelane_b32 v23, s55, 47
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %ret
 ; CHECK-NEXT:    s_endpgm
 ; CHECK-NEXT:  .LBB0_2: ; %bb0
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 0
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 1
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:1]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 2
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 3
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 4
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 5
+; CHECK-NEXT:    v_readlane_b32 s40, v22, 0
+; CHECK-NEXT:    v_readlane_b32 s41, v22, 1
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 6
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 7
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 8
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 9
-; CHECK-NEXT:    v_readlane_b32 s4, v22, 10
-; CHECK-NEXT:    v_readlane_b32 s5, v22, 11
-; CHECK-NEXT:    v_readlane_b32 s6, v22, 12
-; CHECK-NEXT:    v_readlane_b32 s7, v22, 13
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 14
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 15
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 16
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 17
-; CHECK-NEXT:    v_readlane_b32 s4, v22, 18
-; CHECK-NEXT:    v_readlane_b32 s5, v22, 19
-; CHECK-NEXT:    v_readlane_b32 s6, v22, 20
-; CHECK-NEXT:    v_readlane_b32 s7, v22, 21
-; CHECK-NEXT:    v_readlane_b32 s8, v22, 22
-; CHECK-NEXT:    v_readlane_b32 s9, v22, 23
-; CHECK-NEXT:    v_readlane_b32 s10, v22, 24
-; CHECK-NEXT:    v_readlane_b32 s11, v22, 25
-; CHECK-NEXT:    v_readlane_b32 s12, v22, 26
-; CHECK-NEXT:    v_readlane_b32 s13, v22, 27
-; CHECK-NEXT:    v_readlane_b32 s14, v22, 28
-; CHECK-NEXT:    v_readlane_b32 s15, v22, 29
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:15]
+; CHECK-NEXT:    ; use s[40:41]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 30
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 31
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 32
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 33
-; CHECK-NEXT:    v_readlane_b32 s4, v22, 34
-; CHECK-NEXT:    v_readlane_b32 s5, v22, 35
-; CHECK-NEXT:    v_readlane_b32 s6, v22, 36
-; CHECK-NEXT:    v_readlane_b32 s7, v22, 37
+; CHECK-NEXT:    v_readlane_b32 s40, v22, 2
+; CHECK-NEXT:    v_readlane_b32 s41, v22, 3
+; CHECK-NEXT:    v_readlane_b32 s42, v22, 4
+; CHECK-NEXT:    v_readlane_b32 s43, v22, 5
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[40:43]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s40, v22, 6
+; CHECK-NEXT:    v_readlane_b32 s41, v22, 7
+; CHECK-NEXT:    v_readlane_b32 s42, v22, 8
+; CHECK-NEXT:    v_readlane_b32 s43, v22, 9
+; CHECK-NEXT:    v_readlane_b32 s44, v22, 10
+; CHECK-NEXT:    v_readlane_b32 s45, v22, 11
+; CHECK-NEXT:    v_readlane_b32 s46, v22, 12
+; CHECK-NEXT:    v_readlane_b32 s47, v22, 13
+; CHECK-NEXT:    s_mov_b64 s[54:55], s[38:39]
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[40:47]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_mov_b64 s[52:53], s[36:37]
+; CHECK-NEXT:    v_readlane_b32 s36, v22, 14
+; CHECK-NEXT:    v_readlane_b32 s37, v22, 15
+; CHECK-NEXT:    v_readlane_b32 s38, v22, 16
+; CHECK-NEXT:    v_readlane_b32 s39, v22, 17
+; CHECK-NEXT:    v_readlane_b32 s40, v22, 18
+; CHECK-NEXT:    v_readlane_b32 s41, v22, 19
+; CHECK-NEXT:    v_readlane_b32 s42, v22, 20
+; CHECK-NEXT:    v_readlane_b32 s43, v22, 21
+; CHECK-NEXT:    v_readlane_b32 s44, v22, 22
+; CHECK-NEXT:    v_readlane_b32 s45, v22, 23
+; CHECK-NEXT:    v_readlane_b32 s46, v22, 24
+; CHECK-NEXT:    v_readlane_b32 s47, v22, 25
+; CHECK-NEXT:    v_readlane_b32 s48, v22, 26
+; CHECK-NEXT:    v_readlane_b32 s49, v22, 27
+; CHECK-NEXT:    v_readlane_b32 s50, v22, 28
+; CHECK-NEXT:    v_readlane_b32 s51, v22, 29
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[36:51]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s36, v22, 30
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[42:43]
+; CHECK-NEXT:    ; use s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[52:55]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s37, v22, 31
+; CHECK-NEXT:    v_readlane_b32 s38, v22, 32
+; CHECK-NEXT:    v_readlane_b32 s39, v22, 33
+; CHECK-NEXT:    v_readlane_b32 s40, v22, 34
+; CHECK-NEXT:    v_readlane_b32 s41, v22, 35
+; CHECK-NEXT:    v_readlane_b32 s42, v22, 36
+; CHECK-NEXT:    v_readlane_b32 s43, v22, 37
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:7]
+; CHECK-NEXT:    ; use s[36:43]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v22, 38
-; CHECK-NEXT:    v_readlane_b32 s1, v22, 39
-; CHECK-NEXT:    v_readlane_b32 s2, v22, 40
-; CHECK-NEXT:    v_readlane_b32 s3, v22, 41
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[16:31]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s16, v22, 38
+; CHECK-NEXT:    v_readlane_b32 s17, v22, 39
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[40:41]
+; CHECK-NEXT:    ; use s[16:17]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s16, v22, 40
+; CHECK-NEXT:    v_readlane_b32 s17, v22, 41
+; CHECK-NEXT:    v_readlane_b32 s18, v22, 42
+; CHECK-NEXT:    v_readlane_b32 s19, v22, 43
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[36:39]
+; CHECK-NEXT:    ; use s[16:19]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s16, v22, 44
+; CHECK-NEXT:    v_readlane_b32 s17, v22, 45
+; CHECK-NEXT:    v_readlane_b32 s18, v22, 46
+; CHECK-NEXT:    v_readlane_b32 s19, v22, 47
+; CHECK-NEXT:    v_readlane_b32 s20, v22, 48
+; CHECK-NEXT:    v_readlane_b32 s21, v22, 49
+; CHECK-NEXT:    v_readlane_b32 s22, v22, 50
+; CHECK-NEXT:    v_readlane_b32 s23, v22, 51
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[44:51]
+; CHECK-NEXT:    ; use s[16:23]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s4, v22, 42
-; CHECK-NEXT:    v_readlane_b32 s5, v22, 43
-; CHECK-NEXT:    v_readlane_b32 s6, v22, 44
-; CHECK-NEXT:    v_readlane_b32 s7, v22, 45
-; CHECK-NEXT:    v_readlane_b32 s8, v22, 46
-; CHECK-NEXT:    v_readlane_b32 s9, v22, 47
-; CHECK-NEXT:    v_readlane_b32 s10, v22, 48
-; CHECK-NEXT:    v_readlane_b32 s11, v22, 49
-; CHECK-NEXT:    v_readlane_b32 s12, v22, 50
-; CHECK-NEXT:    v_readlane_b32 s13, v22, 51
-; CHECK-NEXT:    v_readlane_b32 s14, v22, 52
-; CHECK-NEXT:    v_readlane_b32 s15, v22, 53
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s0, v22, 52
+; CHECK-NEXT:    v_readlane_b32 s1, v22, 53
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[0:1]
+; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_readlane_b32 s0, v22, 54
 ; CHECK-NEXT:    v_readlane_b32 s1, v22, 55
 ; CHECK-NEXT:    v_readlane_b32 s2, v22, 56
 ; CHECK-NEXT:    v_readlane_b32 s3, v22, 57
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[34:35]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_readlane_b32 s0, v22, 58
diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
index 080bd052a7391e..45c76e7155685e 100644
--- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
+++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
@@ -36,54 +36,53 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   renamable $sgpr34_sgpr35 = IMPLICIT_DEF
   ; CHECK-NEXT:   dead [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   renamable $sgpr41 = IMPLICIT_DEF
-  ; CHECK-NEXT:   renamable $sgpr38_sgpr39 = COPY undef $sgpr8_sgpr9
-  ; CHECK-NEXT:   renamable $sgpr36_sgpr37 = IMPLICIT_DEF
-  ; CHECK-NEXT:   renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4)
-  ; CHECK-NEXT:   dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr38_sgpr39, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT:   renamable $sgpr57 = IMPLICIT_DEF
+  ; CHECK-NEXT:   renamable $sgpr54_sgpr55 = COPY undef $sgpr8_sgpr9
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53 = IMPLICIT_DEF
+  ; CHECK-NEXT:   renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr54_sgpr55, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4)
+  ; CHECK-NEXT:   dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr54_sgpr55, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4)
+  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM renamable $sgpr54_sgpr55, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4)
   ; CHECK-NEXT:   dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr44_sgpr45, 0, 0 :: (invariant load (s64), align 16, addrspace 4)
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   $vgpr1 = COPY renamable $sgpr51
   ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit undef $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   $vcc = COPY renamable $sgpr40_sgpr41
+  ; CHECK-NEXT:   $vcc = COPY renamable $sgpr56_sgpr57
   ; CHECK-NEXT:   S_CBRANCH_VCCZ %bb.2, implicit undef $vcc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4)
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.3(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43:0x00000000000003FF, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4)
-  ; CHECK-NEXT:   S_CMP_LG_U64 renamable $sgpr4_sgpr5, 0, implicit-def $scc
+  ; CHECK-NEXT:   S_CMP_LG_U64 renamable $sgpr36_sgpr37, 0, implicit-def $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
   ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
-  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   S_CBRANCH_VCCZ %bb.5, implicit undef $vcc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
   ; CHECK-NEXT:   successors: %bb.5(0x80000000)
-  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   S_CMP_EQ_U32 renamable $sgpr8, 0, implicit-def $scc
+  ; CHECK-NEXT:   S_CMP_EQ_U32 renamable $sgpr40, 0, implicit-def $scc
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.5:
-  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
+  ; CHECK-NEXT:   liveins: $sgpr34_sgpr35, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr38_sgpr39, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4)
-  ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1)
+  ; CHECK-NEXT:   dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr54_sgpr55, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4)
+  ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], killed renamable $sgpr38_sgpr39, 0, 0, implicit $exec :: (store (s32), addrspace 1)
   ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR undef [[DEF]], undef [[DEF]], renamable $sgpr50_sgpr51, 0, 0, implicit $exec :: (store (s32), addrspace 1)
   ; CHECK-NEXT:   dead [[COPY:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr49
   ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY killed renamable $sgpr36_sgpr37
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY killed renamable $sgpr52_sgpr53
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY killed renamable $sgpr34_sgpr35
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; CHECK-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index ae70abc7317c31..6fffb4505f96b5 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -9758,11 +9758,11 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_add_u32 s40, s40, s11
 ; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, -1, 0
 ; GFX6-NEXT:    s_addc_u32 s41, s41, 0
+; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, -1, v0
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
-; GFX6-NEXT:    s_mov_b64 s[4:5], exec
-; GFX6-NEXT:    s_mov_b32 s6, 0
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0) lgkmcnt(0)
@@ -9940,7 +9940,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX6-NEXT:    ;;#ASMSTART
-; GFX6-NEXT:    ; def s[8:15]
+; GFX6-NEXT:    ; def s[4:11]
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s[16:23]
@@ -9952,39 +9952,51 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    ; def s[0:3]
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
-; GFX6-NEXT:    ; def s[4:5]
+; GFX6-NEXT:    ; def s[12:13]
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ; def s33
 ; GFX6-NEXT:    ;;#ASMEND
-; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX6-NEXT:    s_mov_b64 vcc, s[6:7]
+; GFX6-NEXT:    s_and_saveexec_b64 s[14:15], vcc
+; GFX6-NEXT:    s_mov_b64 vcc, s[14:15]
 ; GFX6-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX6-NEXT:  ; %bb.1: ; %bb0
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 s[14:15], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v4, s8, 0
-; GFX6-NEXT:    v_writelane_b32 v4, s9, 1
-; GFX6-NEXT:    v_writelane_b32 v4, s10, 2
-; GFX6-NEXT:    v_writelane_b32 v4, s11, 3
-; GFX6-NEXT:    v_writelane_b32 v4, s12, 4
-; GFX6-NEXT:    v_writelane_b32 v4, s13, 5
-; GFX6-NEXT:    v_writelane_b32 v4, s14, 6
-; GFX6-NEXT:    v_writelane_b32 v4, s15, 7
+; GFX6-NEXT:    v_writelane_b32 v4, s4, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s5, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s6, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s7, 3
+; GFX6-NEXT:    v_writelane_b32 v4, s8, 4
+; GFX6-NEXT:    v_writelane_b32 v4, s9, 5
+; GFX6-NEXT:    v_writelane_b32 v4, s10, 6
+; GFX6-NEXT:    v_writelane_b32 v4, s11, 7
 ; GFX6-NEXT:    s_mov_b32 s34, 0x81400
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX6-NEXT:    s_mov_b64 s[4:5], exec
+; GFX6-NEXT:    s_mov_b64 exec, 3
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_writelane_b32 v4, s12, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s13, 1
+; GFX6-NEXT:    s_mov_b32 s6, 0x83000
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s6 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s34, 0x80c00
+; GFX6-NEXT:    s_mov_b32 s6, 0x80c00
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s8, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s9, v4, 1
@@ -9996,8 +10008,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s15, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10009,18 +10021,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s21, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s22, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s23, 7
-; GFX6-NEXT:    s_mov_b32 s34, 0x81c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s6, 0x81c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s6 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s34, 0x81400
+; GFX6-NEXT:    s_mov_b32 s6, 0x81400
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s16, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s17, v4, 1
@@ -10032,8 +10044,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s23, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10045,31 +10057,13 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s29, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s30, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s31, 7
-; GFX6-NEXT:    s_mov_b32 s34, 0x82400
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
-; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s34, 0x81c00
+; GFX6-NEXT:    s_mov_b32 s6, 0x82400
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s6 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s34 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s24, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s25, v4, 1
-; GFX6-NEXT:    v_readlane_b32 s26, v4, 2
-; GFX6-NEXT:    v_readlane_b32 s27, v4, 3
-; GFX6-NEXT:    v_readlane_b32 s28, v4, 4
-; GFX6-NEXT:    v_readlane_b32 s29, v4, 5
-; GFX6-NEXT:    v_readlane_b32 s30, v4, 6
-; GFX6-NEXT:    v_readlane_b32 s31, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[6:7], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10077,30 +10071,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s1, 1
 ; GFX6-NEXT:    v_writelane_b32 v4, s2, 2
 ; GFX6-NEXT:    v_writelane_b32 v4, s3, 3
-; GFX6-NEXT:    s_mov_b32 s34, 0x82c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s34 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s6, 0x82c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s6 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[6:7]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
-; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v4, s4, 0
-; GFX6-NEXT:    v_writelane_b32 v4, s5, 1
-; GFX6-NEXT:    s_mov_b32 s2, 0x83000
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[34:35], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_mov_b64 s[24:25], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_mov_b32 s36, 0x82400
+; GFX6-NEXT:    s_mov_b32 s26, 0x81c00
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s26 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
 ; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
@@ -10112,6 +10094,24 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_readlane_b32 s7, v4, 7
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_mov_b64 exec, s[24:25]
+; GFX6-NEXT:    s_mov_b64 s[34:35], exec
+; GFX6-NEXT:    s_mov_b64 exec, 0xff
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s36, 0x82400
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    v_readlane_b32 s24, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s25, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s26, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s27, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s28, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s29, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s30, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s31, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX6-NEXT:    s_mov_b64 s[34:35], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
@@ -10140,7 +10140,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    ;;#ASMSTART
-; GFX6-NEXT:    ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35]
+; GFX6-NEXT:    ; use s[8:15],s[16:23],s[0:7],s[24:31],s[36:39],s[34:35]
 ; GFX6-NEXT:    ;;#ASMEND
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
index a9d687b78efa8c..ade98a5e10d994 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
@@ -52,10 +52,7 @@ define amdgpu_kernel void @max_11_vgprs_used_9a(ptr addrspace(1) %p) #0 {
 }
 
 ; GFX908-LABEL: {{^}}max_11_vgprs_used_1a_partial_spill:
-; GFX908-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GFX908-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; GFX908-DAG: v_accvgpr_write_b32 a0, 1
-; GFX908-DAG:    buffer_store_dword v{{[0-9]}},
 ; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}}
 ; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}}
 ; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}}
@@ -65,8 +62,6 @@ define amdgpu_kernel void @max_11_vgprs_used_9a(ptr addrspace(1) %p) #0 {
 ; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}}
 ; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}}
 ; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
-; GFX908-DAG: v_accvgpr_write_b32 a10, v{{[0-9]}}
-; GFX908-DAG:    buffer_load_dword v{{[0-9]}},
 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0
 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1
 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2
@@ -77,12 +72,11 @@ define amdgpu_kernel void @max_11_vgprs_used_9a(ptr addrspace(1) %p) #0 {
 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7
 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8
 ; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9
-; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a10
 
 ; GFX908: NumVgprs: 10
-; GFX908: ScratchSize: 12
+; GFX908: ScratchSize: 0
 ; GFX908: VGPRBlocks: 2
-; GFX908: NumVGPRsForWavesPerEU: 11
+; GFX908: NumVGPRsForWavesPerEU: 10
 define amdgpu_kernel void @max_11_vgprs_used_1a_partial_spill(ptr addrspace(1) %p) #0 {
   %tid = load volatile i32, ptr addrspace(1) undef
   call void asm sideeffect "", "a"(i32 1)
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
index 42db92b15acf50..4bac6676db147e 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
@@ -52,15 +52,16 @@ body:             |
     ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_18:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
     ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_19:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
     ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 80, 0, 0, implicit $exec :: (load (s128), addrspace 1)
+    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
     ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 96, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
-    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
-    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
+    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
+    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
+    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
     ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[COPY1]], 0, 112, 0, 0, implicit $exec :: (load (s128), addrspace 1)
-    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec
-    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec
-    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
-    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
+    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec
+    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec
+    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
+    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_27:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
     ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
     ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_]].sub2
     ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
@@ -120,26 +121,23 @@ body:             |
     ; CHECK-NEXT: undef [[COPY21:%[0-9]+]].sub0:vreg_128 = COPY [[COPY20]].sub0 {
     ; CHECK-NEXT:   internal [[COPY21]].sub2:vreg_128 = COPY [[COPY20]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_10:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
-    ; CHECK-NEXT: undef [[COPY22:%[0-9]+]].sub0:vreg_128 = COPY [[V_LSHRREV_B32_e32_10]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY22]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_10]].sub2
-    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef [[COPY22:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_10]].sub2
+    ; CHECK-NEXT: [[COPY22:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
     ; CHECK-NEXT: undef [[COPY23:%[0-9]+]].sub0:vreg_128 = COPY [[COPY22]].sub0 {
     ; CHECK-NEXT:   internal [[COPY23]].sub2:vreg_128 = COPY [[COPY22]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY23]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY23]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
     ; CHECK-NEXT: undef [[COPY24:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_11]].sub2
     ; CHECK-NEXT: [[COPY24:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
     ; CHECK-NEXT: undef [[COPY25:%[0-9]+]].sub0:vreg_128 = COPY [[COPY24]].sub0 {
     ; CHECK-NEXT:   internal [[COPY25]].sub2:vreg_128 = COPY [[COPY24]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY25]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY25]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
     ; CHECK-NEXT: undef [[COPY26:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_12]].sub2
     ; CHECK-NEXT: [[COPY26:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
     ; CHECK-NEXT: undef [[COPY27:%[0-9]+]].sub0:vreg_128 = COPY [[COPY26]].sub0 {
     ; CHECK-NEXT:   internal [[COPY27]].sub2:vreg_128 = COPY [[COPY26]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY27]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
     ; CHECK-NEXT: undef [[COPY28:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_13]].sub2
     ; CHECK-NEXT: [[COPY28:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
     ; CHECK-NEXT: undef [[COPY29:%[0-9]+]].sub0:vreg_128 = COPY [[COPY28]].sub0 {
@@ -150,7 +148,7 @@ body:             |
     ; CHECK-NEXT: undef [[COPY31:%[0-9]+]].sub0:vreg_128 = COPY [[COPY30]].sub0 {
     ; CHECK-NEXT:   internal [[COPY31]].sub2:vreg_128 = COPY [[COPY30]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY31]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[COPY31]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
     ; CHECK-NEXT: undef [[COPY32:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_15]].sub2
     ; CHECK-NEXT: [[COPY32:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
     ; CHECK-NEXT: undef [[COPY33:%[0-9]+]].sub0:vreg_128 = COPY [[COPY32]].sub0 {
@@ -176,20 +174,19 @@ body:             |
     ; CHECK-NEXT: undef [[COPY41:%[0-9]+]].sub0:vreg_128 = COPY [[COPY40]].sub0 {
     ; CHECK-NEXT:   internal [[COPY41]].sub2:vreg_128 = COPY [[COPY40]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_27:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
-    ; CHECK-NEXT: undef [[COPY42:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub2
+    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_28:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
+    ; CHECK-NEXT: undef [[COPY42:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_28]].sub2
     ; CHECK-NEXT: [[COPY42:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
     ; CHECK-NEXT: undef [[COPY43:%[0-9]+]].sub0:vreg_128 = COPY [[COPY42]].sub0 {
     ; CHECK-NEXT:   internal [[COPY43]].sub2:vreg_128 = COPY [[COPY42]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_28:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
-    ; CHECK-NEXT: undef [[COPY44:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_28]].sub2
+    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_29:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
+    ; CHECK-NEXT: undef [[COPY44:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_29]].sub2
     ; CHECK-NEXT: [[COPY44:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
     ; CHECK-NEXT: undef [[COPY45:%[0-9]+]].sub0:vreg_128 = COPY [[COPY44]].sub0 {
     ; CHECK-NEXT:   internal [[COPY45]].sub2:vreg_128 = COPY [[COPY44]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_29:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
-    ; CHECK-NEXT: undef [[COPY46:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_29]].sub2
+    ; CHECK-NEXT: undef [[COPY46:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_20]].sub2
     ; CHECK-NEXT: [[COPY46:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
     ; CHECK-NEXT: undef [[COPY47:%[0-9]+]].sub0:vreg_128 = COPY [[COPY46]].sub0 {
     ; CHECK-NEXT:   internal [[COPY47]].sub2:vreg_128 = COPY [[COPY46]].sub2
@@ -197,268 +194,279 @@ body:             |
     ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_30:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
     ; CHECK-NEXT: undef [[COPY48:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_30]].sub2
     ; CHECK-NEXT: [[COPY48:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+    ; CHECK-NEXT: undef [[COPY49:%[0-9]+]].sub0:vreg_128 = COPY [[COPY48]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY49]].sub2:vreg_128 = COPY [[COPY48]].sub2
+    ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[V_LSHRREV_B32_e32_31:%[0-9]+]].sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
-    ; CHECK-NEXT: undef [[COPY49:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_31]].sub2
-    ; CHECK-NEXT: [[COPY49:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_26]], [[S_MOV_B32_]], 0, 480, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_25]], [[S_MOV_B32_]], 0, 496, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_24]], [[S_MOV_B32_]], 0, 448, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_23]], [[S_MOV_B32_]], 0, 464, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_22]], [[S_MOV_B32_]], 0, 416, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_21]], [[S_MOV_B32_]], 0, 432, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_20:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_20]], [[S_MOV_B32_]], 0, 384, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
-    ; CHECK-NEXT: undef [[COPY50:%[0-9]+]].sub0:vreg_128 = COPY [[COPY49]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY50]].sub2:vreg_128 = COPY [[COPY49]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY50:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY50:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY50]], [[S_MOV_B32_]], 0, 400, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef [[COPY51:%[0-9]+]].sub0:vreg_128 = COPY [[COPY48]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY51]].sub2:vreg_128 = COPY [[COPY48]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY51:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY51:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY51]], [[S_MOV_B32_]], 0, 352, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: undef [[COPY52:%[0-9]+]].sub0:vreg_128 = COPY [[COPY47]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY52]].sub2:vreg_128 = COPY [[COPY47]].sub2
+    ; CHECK-NEXT: undef [[COPY50:%[0-9]+]].sub2:vreg_128 = COPY [[V_LSHRREV_B32_e32_31]].sub2
+    ; CHECK-NEXT: [[COPY50:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
+    ; CHECK-NEXT: undef [[COPY51:%[0-9]+]].sub0:vreg_128 = COPY [[COPY50]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY51]].sub2:vreg_128 = COPY [[COPY50]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub1, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub0, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_27:%[0-9]+]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_1]], [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_27:%[0-9]+]].sub1:vreg_128 = V_MOV_B32_e32 0, implicit $exec
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_27:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_27]], [[S_MOV_B32_]], 0, 480, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_26:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_26]], [[S_MOV_B32_]], 0, 496, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_25:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_25]], [[S_MOV_B32_]], 0, 448, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_24:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_24]], [[S_MOV_B32_]], 0, 464, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_23:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_23]], [[S_MOV_B32_]], 0, 416, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_22:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_22]], [[S_MOV_B32_]], 0, 432, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[V_LSHRREV_B32_e32_21:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[V_LSHRREV_B32_e32_21]], [[S_MOV_B32_]], 0, 384, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+    ; CHECK-NEXT: undef [[COPY52:%[0-9]+]].sub0:vreg_128 = COPY [[COPY51]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY52]].sub2:vreg_128 = COPY [[COPY51]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY53:%[0-9]+]].sub0:vreg_128 = COPY [[COPY52]].sub0 {
     ; CHECK-NEXT:   internal [[COPY53]].sub2:vreg_128 = COPY [[COPY52]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY53:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY53:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY53]], [[S_MOV_B32_]], 0, 368, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef [[COPY54:%[0-9]+]].sub0:vreg_128 = COPY [[COPY45]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY54]].sub2:vreg_128 = COPY [[COPY45]].sub2
+    ; CHECK-NEXT: [[COPY53:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY53:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY53]], [[S_MOV_B32_]], 0, 400, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef [[COPY54:%[0-9]+]].sub0:vreg_128 = COPY [[COPY49]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY54]].sub2:vreg_128 = COPY [[COPY49]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY55:%[0-9]+]].sub0:vreg_128 = COPY [[COPY54]].sub0 {
     ; CHECK-NEXT:   internal [[COPY55]].sub2:vreg_128 = COPY [[COPY54]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY55:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY55:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY55]], [[S_MOV_B32_]], 0, 320, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: undef [[COPY56:%[0-9]+]].sub0:vreg_128 = COPY [[COPY43]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY56]].sub2:vreg_128 = COPY [[COPY43]].sub2
+    ; CHECK-NEXT: [[COPY55:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY55:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY55]], [[S_MOV_B32_]], 0, 352, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: undef [[COPY56:%[0-9]+]].sub0:vreg_128 = COPY [[COPY47]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY56]].sub2:vreg_128 = COPY [[COPY47]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY57:%[0-9]+]].sub0:vreg_128 = COPY [[COPY56]].sub0 {
     ; CHECK-NEXT:   internal [[COPY57]].sub2:vreg_128 = COPY [[COPY56]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY57:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY57:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY57]], [[S_MOV_B32_]], 0, 336, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef [[COPY58:%[0-9]+]].sub0:vreg_128 = COPY [[COPY41]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY58]].sub2:vreg_128 = COPY [[COPY41]].sub2
+    ; CHECK-NEXT: [[COPY57:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY57:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY57]], [[S_MOV_B32_]], 0, 368, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef [[COPY58:%[0-9]+]].sub0:vreg_128 = COPY [[COPY45]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY58]].sub2:vreg_128 = COPY [[COPY45]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY59:%[0-9]+]].sub0:vreg_128 = COPY [[COPY58]].sub0 {
     ; CHECK-NEXT:   internal [[COPY59]].sub2:vreg_128 = COPY [[COPY58]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY59:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY59:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY59]], [[S_MOV_B32_]], 0, 288, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: undef [[COPY60:%[0-9]+]].sub0:vreg_128 = COPY [[COPY39]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY60]].sub2:vreg_128 = COPY [[COPY39]].sub2
+    ; CHECK-NEXT: [[COPY59:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY59:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY59]], [[S_MOV_B32_]], 0, 320, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK-NEXT: undef [[COPY60:%[0-9]+]].sub0:vreg_128 = COPY [[COPY43]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY60]].sub2:vreg_128 = COPY [[COPY43]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY61:%[0-9]+]].sub0:vreg_128 = COPY [[COPY60]].sub0 {
     ; CHECK-NEXT:   internal [[COPY61]].sub2:vreg_128 = COPY [[COPY60]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY61:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY61:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY61]], [[S_MOV_B32_]], 0, 304, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef [[COPY62:%[0-9]+]].sub0:vreg_128 = COPY [[COPY37]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY62]].sub2:vreg_128 = COPY [[COPY37]].sub2
+    ; CHECK-NEXT: [[COPY61:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY61:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY61]], [[S_MOV_B32_]], 0, 336, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef [[COPY62:%[0-9]+]].sub0:vreg_128 = COPY [[COPY41]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY62]].sub2:vreg_128 = COPY [[COPY41]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY63:%[0-9]+]].sub0:vreg_128 = COPY [[COPY62]].sub0 {
     ; CHECK-NEXT:   internal [[COPY63]].sub2:vreg_128 = COPY [[COPY62]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY63:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY63:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY63]], [[S_MOV_B32_]], 0, 256, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1)
-    ; CHECK-NEXT: undef [[COPY64:%[0-9]+]].sub0:vreg_128 = COPY [[COPY35]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY64]].sub2:vreg_128 = COPY [[COPY35]].sub2
+    ; CHECK-NEXT: [[COPY63:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY63:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY63]], [[S_MOV_B32_]], 0, 288, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: undef [[COPY64:%[0-9]+]].sub0:vreg_128 = COPY [[COPY39]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY64]].sub2:vreg_128 = COPY [[COPY39]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY65:%[0-9]+]].sub0:vreg_128 = COPY [[COPY64]].sub0 {
     ; CHECK-NEXT:   internal [[COPY65]].sub2:vreg_128 = COPY [[COPY64]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY65:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY65:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY65]], [[S_MOV_B32_]], 0, 272, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef [[COPY66:%[0-9]+]].sub0:vreg_128 = COPY [[COPY33]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY66]].sub2:vreg_128 = COPY [[COPY33]].sub2
+    ; CHECK-NEXT: [[COPY65:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY65:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY65]], [[S_MOV_B32_]], 0, 304, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef [[COPY66:%[0-9]+]].sub0:vreg_128 = COPY [[COPY37]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY66]].sub2:vreg_128 = COPY [[COPY37]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY67:%[0-9]+]].sub0:vreg_128 = COPY [[COPY66]].sub0 {
     ; CHECK-NEXT:   internal [[COPY67]].sub2:vreg_128 = COPY [[COPY66]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY67:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY67:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY67]], [[S_MOV_B32_]], 0, 224, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY68:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY68]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub2
+    ; CHECK-NEXT: [[COPY67:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY67:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY67]], [[S_MOV_B32_]], 0, 256, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1)
+    ; CHECK-NEXT: undef [[COPY68:%[0-9]+]].sub0:vreg_128 = COPY [[COPY35]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY68]].sub2:vreg_128 = COPY [[COPY35]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY69:%[0-9]+]].sub0:vreg_128 = COPY [[COPY68]].sub0 {
     ; CHECK-NEXT:   internal [[COPY69]].sub2:vreg_128 = COPY [[COPY68]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY69:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY69:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY69]], [[S_MOV_B32_]], 0, 240, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef [[COPY70:%[0-9]+]].sub0:vreg_128 = COPY [[COPY29]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY70]].sub2:vreg_128 = COPY [[COPY29]].sub2
+    ; CHECK-NEXT: [[COPY69:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY69:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY69]], [[S_MOV_B32_]], 0, 272, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef [[COPY70:%[0-9]+]].sub0:vreg_128 = COPY [[COPY33]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY70]].sub2:vreg_128 = COPY [[COPY33]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY71:%[0-9]+]].sub0:vreg_128 = COPY [[COPY70]].sub0 {
     ; CHECK-NEXT:   internal [[COPY71]].sub2:vreg_128 = COPY [[COPY70]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY71:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY71:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY71]], [[S_MOV_B32_]], 0, 192, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY72:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY72]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub2
+    ; CHECK-NEXT: [[COPY71:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY71:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY71]], [[S_MOV_B32_]], 0, 224, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
+    ; CHECK-NEXT: undef [[COPY72:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY72]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY73:%[0-9]+]].sub0:vreg_128 = COPY [[COPY72]].sub0 {
     ; CHECK-NEXT:   internal [[COPY73]].sub2:vreg_128 = COPY [[COPY72]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY73:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY73:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY73]], [[S_MOV_B32_]], 0, 208, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY74:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY74]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub2
+    ; CHECK-NEXT: [[COPY73:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY73:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY73]], [[S_MOV_B32_]], 0, 240, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef [[COPY74:%[0-9]+]].sub0:vreg_128 = COPY [[COPY29]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY74]].sub2:vreg_128 = COPY [[COPY29]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY75:%[0-9]+]].sub0:vreg_128 = COPY [[COPY74]].sub0 {
     ; CHECK-NEXT:   internal [[COPY75]].sub2:vreg_128 = COPY [[COPY74]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY75:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY75:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY75]], [[S_MOV_B32_]], 0, 160, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY76:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY76]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub2
+    ; CHECK-NEXT: [[COPY75:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY75:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY75]], [[S_MOV_B32_]], 0, 192, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK-NEXT: undef [[COPY76:%[0-9]+]].sub0:vreg_128 = COPY [[COPY27]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY76]].sub2:vreg_128 = COPY [[COPY27]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY77:%[0-9]+]].sub0:vreg_128 = COPY [[COPY76]].sub0 {
     ; CHECK-NEXT:   internal [[COPY77]].sub2:vreg_128 = COPY [[COPY76]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY77:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY77:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY77]], [[S_MOV_B32_]], 0, 176, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef [[COPY78:%[0-9]+]].sub0:vreg_128 = COPY [[COPY21]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY78]].sub2:vreg_128 = COPY [[COPY21]].sub2
+    ; CHECK-NEXT: [[COPY77:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY77:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY77]], [[S_MOV_B32_]], 0, 208, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
+    ; CHECK-NEXT: undef [[COPY78:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY78]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY79:%[0-9]+]].sub0:vreg_128 = COPY [[COPY78]].sub0 {
     ; CHECK-NEXT:   internal [[COPY79]].sub2:vreg_128 = COPY [[COPY78]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY79:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY79:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY79]], [[S_MOV_B32_]], 0, 128, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
-    ; CHECK-NEXT: undef [[COPY80:%[0-9]+]].sub0:vreg_128 = COPY [[COPY19]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY80]].sub2:vreg_128 = COPY [[COPY19]].sub2
+    ; CHECK-NEXT: [[COPY79:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY79:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY79]], [[S_MOV_B32_]], 0, 160, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
+    ; CHECK-NEXT: undef [[COPY80:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY80]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY81:%[0-9]+]].sub0:vreg_128 = COPY [[COPY80]].sub0 {
     ; CHECK-NEXT:   internal [[COPY81]].sub2:vreg_128 = COPY [[COPY80]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY81:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY81:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY81]], [[S_MOV_B32_]], 0, 144, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY82:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY82]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub2
+    ; CHECK-NEXT: [[COPY81:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY81:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY81]], [[S_MOV_B32_]], 0, 176, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef [[COPY82:%[0-9]+]].sub0:vreg_128 = COPY [[COPY21]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY82]].sub2:vreg_128 = COPY [[COPY21]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY83:%[0-9]+]].sub0:vreg_128 = COPY [[COPY82]].sub0 {
     ; CHECK-NEXT:   internal [[COPY83]].sub2:vreg_128 = COPY [[COPY82]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY83:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY83:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY83]], [[S_MOV_B32_]], 0, 96, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY84:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY84]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub2
+    ; CHECK-NEXT: [[COPY83:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY83:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY83]], [[S_MOV_B32_]], 0, 128, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+    ; CHECK-NEXT: undef [[COPY84:%[0-9]+]].sub0:vreg_128 = COPY [[COPY19]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY84]].sub2:vreg_128 = COPY [[COPY19]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY85:%[0-9]+]].sub0:vreg_128 = COPY [[COPY84]].sub0 {
     ; CHECK-NEXT:   internal [[COPY85]].sub2:vreg_128 = COPY [[COPY84]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY85:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY85:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY85]], [[S_MOV_B32_]], 0, 112, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY86:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY86]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub2
+    ; CHECK-NEXT: [[COPY85:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY85:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY85]], [[S_MOV_B32_]], 0, 144, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5)
+    ; CHECK-NEXT: undef [[COPY86:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY86]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY87:%[0-9]+]].sub0:vreg_128 = COPY [[COPY86]].sub0 {
     ; CHECK-NEXT:   internal [[COPY87]].sub2:vreg_128 = COPY [[COPY86]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY87:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY87:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY87]], [[S_MOV_B32_]], 0, 64, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY88:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY88]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub2
+    ; CHECK-NEXT: [[COPY87:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY87:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY87]], [[S_MOV_B32_]], 0, 96, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
+    ; CHECK-NEXT: undef [[COPY88:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY88]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY89:%[0-9]+]].sub0:vreg_128 = COPY [[COPY88]].sub0 {
     ; CHECK-NEXT:   internal [[COPY89]].sub2:vreg_128 = COPY [[COPY88]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY89:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY89:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY89]], [[S_MOV_B32_]], 0, 80, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY90:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY90]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub2
+    ; CHECK-NEXT: [[COPY89:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY89:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY89]], [[S_MOV_B32_]], 0, 112, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
+    ; CHECK-NEXT: undef [[COPY90:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY90]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY91:%[0-9]+]].sub0:vreg_128 = COPY [[COPY90]].sub0 {
     ; CHECK-NEXT:   internal [[COPY91]].sub2:vreg_128 = COPY [[COPY90]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY91:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY91:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY91]], [[S_MOV_B32_]], 0, 32, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY92:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY92]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub2
+    ; CHECK-NEXT: [[COPY91:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY91:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY91]], [[S_MOV_B32_]], 0, 64, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+    ; CHECK-NEXT: undef [[COPY92:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY92]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY93:%[0-9]+]].sub0:vreg_128 = COPY [[COPY92]].sub0 {
     ; CHECK-NEXT:   internal [[COPY93]].sub2:vreg_128 = COPY [[COPY92]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY93:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY93:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY93]], [[S_MOV_B32_]], 0, 48, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY94:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY94]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub2
+    ; CHECK-NEXT: [[COPY93:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY93:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY93]], [[S_MOV_B32_]], 0, 80, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
+    ; CHECK-NEXT: undef [[COPY94:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY94]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY95:%[0-9]+]].sub0:vreg_128 = COPY [[COPY94]].sub0 {
     ; CHECK-NEXT:   internal [[COPY95]].sub2:vreg_128 = COPY [[COPY94]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY95:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY95:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY95]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (store (s128), align 512, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
-    ; CHECK-NEXT: undef [[COPY96:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub0 {
-    ; CHECK-NEXT:   internal [[COPY96]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub2
+    ; CHECK-NEXT: [[COPY95:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY95:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY95]], [[S_MOV_B32_]], 0, 32, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+    ; CHECK-NEXT: undef [[COPY96:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY96]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef [[COPY97:%[0-9]+]].sub0:vreg_128 = COPY [[COPY96]].sub0 {
     ; CHECK-NEXT:   internal [[COPY97]].sub2:vreg_128 = COPY [[COPY96]].sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: [[COPY97:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: [[COPY97:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_26]].sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY97]], [[S_MOV_B32_]], 0, 16, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[COPY97:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY97:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY97]], [[S_MOV_B32_]], 0, 48, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+    ; CHECK-NEXT: undef [[COPY98:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY98]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef [[COPY99:%[0-9]+]].sub0:vreg_128 = COPY [[COPY98]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY99]].sub2:vreg_128 = COPY [[COPY98]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: [[COPY99:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY99:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY99]], [[S_MOV_B32_]], 0, 0, 0, 0, implicit $exec :: (store (s128), align 512, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: undef [[COPY100:%[0-9]+]].sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY100]].sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef [[COPY101:%[0-9]+]].sub0:vreg_128 = COPY [[COPY100]].sub0 {
+    ; CHECK-NEXT:   internal [[COPY101]].sub2:vreg_128 = COPY [[COPY100]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: [[COPY101:%[0-9]+]].sub1:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: [[COPY101:%[0-9]+]].sub3:vreg_128 = COPY [[V_LSHRREV_B32_e32_27]].sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET [[COPY101]], [[S_MOV_B32_]], 0, 16, 0, 0, implicit $exec :: (store (s128), addrspace 1)
     ; CHECK-NEXT: S_ENDPGM 0
     %0:sgpr_64(p4) = COPY $sgpr0_sgpr1
     %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0(p4), 9, 0 :: (dereferenceable invariant load (s128), align 4, addrspace 4)
diff --git a/llvm/test/CodeGen/AVR/inline-asm/inline-asm3.ll b/llvm/test/CodeGen/AVR/inline-asm/inline-asm3.ll
index 07839a43331f08..5ed012c72150f8 100644
--- a/llvm/test/CodeGen/AVR/inline-asm/inline-asm3.ll
+++ b/llvm/test/CodeGen/AVR/inline-asm/inline-asm3.ll
@@ -224,15 +224,15 @@ define void @add_b_i16(i16 signext %0, i16 signext %1) {
 define void @add_e_i8(i8 signext %0, i8 signext %1) {
 ; CHECK-LABEL: add_e_i8:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov r30, r22
-; CHECK-NEXT:    mov r22, r24
-; CHECK-NEXT:    mov r26, r22
+; CHECK-NEXT:    mov r20, r22
+; CHECK-NEXT:    mov r30, r24
+; CHECK-NEXT:    mov r26, r20
 ; CHECK-NEXT:    ;APP
-; CHECK-NEXT:    mov r26, r26
-; CHECK-NEXT:    add r26, r30
+; CHECK-NEXT:    mov r26, r30
+; CHECK-NEXT:    lsl r26
 ; CHECK-NEXT:    ;NO_APP
-; CHECK-NEXT:    mov r20, r30
 ; CHECK-NEXT:    mov r24, r26
+; CHECK-NEXT:    mov r22, r30
 ; CHECK-NEXT:    rcall foo8
 ; CHECK-NEXT:    ret
   %3 = tail call i8 asm sideeffect "mov $0, $1\0Aadd $0, $2", "=e,e,e"(i8 %0, i8 %1)
@@ -243,22 +243,22 @@ define void @add_e_i8(i8 signext %0, i8 signext %1) {
 define void @add_e_i16(i16 signext %0, i16 signext %1) {
 ; CHECK-LABEL: add_e_i16:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    mov r30, r22
-; CHECK-NEXT:    mov r31, r23
-; CHECK-NEXT:    mov r22, r24
-; CHECK-NEXT:    mov r23, r25
-; CHECK-NEXT:    mov r26, r22
-; CHECK-NEXT:    mov r27, r23
+; CHECK-NEXT:    mov r20, r22
+; CHECK-NEXT:    mov r21, r23
+; CHECK-NEXT:    mov r30, r24
+; CHECK-NEXT:    mov r31, r25
+; CHECK-NEXT:    mov r26, r20
+; CHECK-NEXT:    mov r27, r21
 ; CHECK-NEXT:    ;APP
-; CHECK-NEXT:    mov r26, r26
-; CHECK-NEXT:    mov r27, r27
-; CHECK-NEXT:    add r26, r30
-; CHECK-NEXT:    adc r27, r31
+; CHECK-NEXT:    mov r26, r30
+; CHECK-NEXT:    mov r27, r31
+; CHECK-NEXT:    lsl r26
+; CHECK-NEXT:    rol r27
 ; CHECK-NEXT:    ;NO_APP
 ; CHECK-NEXT:    mov r24, r26
 ; CHECK-NEXT:    mov r25, r27
-; CHECK-NEXT:    mov r20, r30
-; CHECK-NEXT:    mov r21, r31
+; CHECK-NEXT:    mov r22, r30
+; CHECK-NEXT:    mov r23, r31
 ; CHECK-NEXT:    rcall foo16
 ; CHECK-NEXT:    ret
   %3 = tail call i16 asm sideeffect "mov ${0:A}, ${1:A}\0Amov ${0:B}, ${1:B}\0Aadd ${0:A}, ${2:A}\0Aadc ${0:B}, ${2:B}", "=e,e,e"(i16 %0, i16 %1)
diff --git a/llvm/test/CodeGen/Hexagon/packetize-impdef-1.ll b/llvm/test/CodeGen/Hexagon/packetize-impdef-1.ll
index 6e84602fb7eaa2..f4df0c044d2e31 100644
--- a/llvm/test/CodeGen/Hexagon/packetize-impdef-1.ll
+++ b/llvm/test/CodeGen/Hexagon/packetize-impdef-1.ll
@@ -9,9 +9,7 @@
 ; r1 = IMPLICIT_DEF
 ; S2_storerd_io r29, 0, d0
 
-; CHECK: memd(r29+#0) = r{{[0-9]+}}:{{[0-9]+}}
-; CHECK: memd(r29+#0) = r{{[0-9]+}}:{{[0-9]+}}
-
+; CHECK: memw(r29+#{{[0-9]+}}) = r{{[0-9]+}}.new
 define ptr @f0(ptr %a0) local_unnamed_addr {
 b0:
   %v0 = tail call ptr @f1(i32 0)
diff --git a/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir b/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
index d291af97d3afd9..6996fa3f1c1d98 100644
--- a/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
+++ b/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
@@ -1,9 +1,9 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -march=hexagon -enable-subreg-liveness -start-after machine-scheduler -stop-after stack-slot-coloring -o - %s | FileCheck %s
 
 --- |
   target triple = "hexagon"
 
-  ; Function Attrs: nounwind optsize
   define void @main() #0 {
   entry:
     br label %for.body
@@ -142,6 +142,63 @@ frameInfo:
   hasVAStart:      false
   hasMustTailInVarArgFunc: false
 body:             |
+  ; CHECK-LABEL: name: main
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r0 = A2_tfrsi 524288
+  ; CHECK-NEXT:   renamable $r17 = A2_tfrsi 0
+  ; CHECK-NEXT:   renamable $d11 = S2_extractup undef renamable $d0, 6, 25
+  ; CHECK-NEXT:   $r23 = A2_tfrsi 524288
+  ; CHECK-NEXT:   renamable $d0 = A2_tfrpi 2
+  ; CHECK-NEXT:   renamable $d13 = A2_tfrpi -1
+  ; CHECK-NEXT:   renamable $d13 = S2_asl_r_p_acc killed renamable $d13, killed renamable $d0, renamable $r22
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.for.body:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $d8:0x0000000000000001, $d11:0x0000000000000002, $d13, $r23
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
+  ; CHECK-NEXT:   J2_call @lrand48, implicit-def dead $d0, implicit-def dead $d1, implicit-def dead $d2, implicit-def dead $d3, implicit-def dead $d4, implicit-def dead $d5, implicit-def dead $d6, implicit-def dead $d7, implicit-def dead $r28, implicit-def dead $r31, implicit-def dead $p0, implicit-def dead $p1, implicit-def dead $p2, implicit-def dead $p3, implicit-def dead $m0, implicit-def dead $m1, implicit-def dead $lc0, implicit-def dead $lc1, implicit-def dead $sa0, implicit-def dead $sa1, implicit-def dead $usr, implicit-def $usr_ovf, implicit-def dead $cs0, implicit-def dead $cs1, implicit-def dead $w0, implicit-def dead $w1, implicit-def dead $w2, implicit-def dead $w3, implicit-def dead $w4, implicit-def dead $w5, implicit-def dead $w6, implicit-def dead $w7, implicit-def dead $w8, implicit-def dead $w9, implicit-def dead $w10, implicit-def dead $w11, implicit-def dead $w12, implicit-def dead $w13, implicit-def dead $w14, implicit-def dead $w15, implicit-def dead $q0, implicit-def dead $q1, implicit-def dead $q2, implicit-def dead $q3, implicit-def $r0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
+  ; CHECK-NEXT:   renamable $r18 = COPY $r0
+  ; CHECK-NEXT:   renamable $r19 = S2_asr_i_r renamable $r18, 31
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
+  ; CHECK-NEXT:   J2_call @lrand48, implicit-def dead $d0, implicit-def dead $d1, implicit-def dead $d2, implicit-def dead $d3, implicit-def dead $d4, implicit-def dead $d5, implicit-def dead $d6, implicit-def dead $d7, implicit-def dead $r28, implicit-def dead $r31, implicit-def dead $p0, implicit-def dead $p1, implicit-def dead $p2, implicit-def dead $p3, implicit-def dead $m0, implicit-def dead $m1, implicit-def dead $lc0, implicit-def dead $lc1, implicit-def dead $sa0, implicit-def dead $sa1, implicit-def dead $usr, implicit-def $usr_ovf, implicit-def dead $cs0, implicit-def dead $cs1, implicit-def dead $w0, implicit-def dead $w1, implicit-def dead $w2, implicit-def dead $w3, implicit-def dead $w4, implicit-def dead $w5, implicit-def dead $w6, implicit-def dead $w7, implicit-def dead $w8, implicit-def dead $w9, implicit-def dead $w10, implicit-def dead $w11, implicit-def dead $w12, implicit-def dead $w13, implicit-def dead $w14, implicit-def dead $w15, implicit-def dead $q0, implicit-def dead $q1, implicit-def dead $q2, implicit-def dead $q3, implicit-def $r0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
+  ; CHECK-NEXT:   renamable $r16 = COPY $r0
+  ; CHECK-NEXT:   renamable $d12 = S2_extractup renamable $d8, 22, 9
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
+  ; CHECK-NEXT:   J2_call @lrand48, implicit-def dead $d0, implicit-def dead $d1, implicit-def dead $d2, implicit-def dead $d3, implicit-def dead $d4, implicit-def dead $d5, implicit-def dead $d6, implicit-def dead $d7, implicit-def dead $r28, implicit-def dead $r31, implicit-def dead $p0, implicit-def dead $p1, implicit-def dead $p2, implicit-def dead $p3, implicit-def dead $m0, implicit-def dead $m1, implicit-def dead $lc0, implicit-def dead $lc1, implicit-def dead $sa0, implicit-def dead $sa1, implicit-def dead $usr, implicit-def $usr_ovf, implicit-def dead $cs0, implicit-def dead $cs1, implicit-def dead $w0, implicit-def dead $w1, implicit-def dead $w2, implicit-def dead $w3, implicit-def dead $w4, implicit-def dead $w5, implicit-def dead $w6, implicit-def dead $w7, implicit-def dead $w8, implicit-def dead $w9, implicit-def dead $w10, implicit-def dead $w11, implicit-def dead $w12, implicit-def dead $w13, implicit-def dead $w14, implicit-def dead $w15, implicit-def dead $q0, implicit-def dead $q1, implicit-def dead $q2, implicit-def dead $q3, implicit-def $r0
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
+  ; CHECK-NEXT:   renamable $r1 = COPY renamable $r17
+  ; CHECK-NEXT:   renamable $d0 = S2_extractup killed renamable $d0, 6, 25
+  ; CHECK-NEXT:   renamable $d10 = A2_tfrpi -1
+  ; CHECK-NEXT:   renamable $d1 = A2_tfrpi 2
+  ; CHECK-NEXT:   renamable $d10 = S2_asl_r_p_acc killed renamable $d10, killed renamable $d1, killed renamable $r0
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
+  ; CHECK-NEXT:   J2_call @lrand48, implicit-def dead $d0, implicit-def dead $d1, implicit-def dead $d2, implicit-def dead $d3, implicit-def dead $d4, implicit-def dead $d5, implicit-def dead $d6, implicit-def dead $d7, implicit-def dead $r28, implicit-def dead $r31, implicit-def dead $p0, implicit-def dead $p1, implicit-def dead $p2, implicit-def dead $p3, implicit-def dead $m0, implicit-def dead $m1, implicit-def dead $lc0, implicit-def dead $lc1, implicit-def dead $sa0, implicit-def dead $sa1, implicit-def dead $usr, implicit-def $usr_ovf, implicit-def dead $cs0, implicit-def dead $cs1, implicit-def dead $w0, implicit-def dead $w1, implicit-def dead $w2, implicit-def dead $w3, implicit-def dead $w4, implicit-def dead $w5, implicit-def dead $w6, implicit-def dead $w7, implicit-def dead $w8, implicit-def dead $w9, implicit-def dead $w10, implicit-def dead $w11, implicit-def dead $w12, implicit-def dead $w13, implicit-def dead $w14, implicit-def dead $w15, implicit-def dead $q0, implicit-def dead $q1, implicit-def dead $q2, implicit-def dead $q3
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
+  ; CHECK-NEXT:   renamable $d0 = S2_asl_r_p killed renamable $d10, renamable $r22
+  ; CHECK-NEXT:   renamable $d1 = COPY renamable $d13
+  ; CHECK-NEXT:   renamable $d1 = S2_lsr_i_p_and killed renamable $d1, killed renamable $d9, 9
+  ; CHECK-NEXT:   renamable $d0 = S2_asl_i_p_and killed renamable $d0, killed renamable $d12, 42
+  ; CHECK-NEXT:   S2_storerd_io undef renamable $r0, 0, killed renamable $d0 :: (store (s64) into `ptr undef`)
+  ; CHECK-NEXT:   renamable $d0 = A2_tfrpi 0
+  ; CHECK-NEXT:   renamable $p0 = C2_cmpeqp killed renamable $d1, killed renamable $d0
+  ; CHECK-NEXT:   J2_jumpt killed renamable $p0, %bb.3, implicit-def dead $pc
+  ; CHECK-NEXT:   J2_jump %bb.2, implicit-def dead $pc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.if.end82:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT:   liveins: $d8:0x0000000000000001, $d11:0x0000000000000002, $d13, $r23
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r23 = A2_addi killed renamable $r23, -1
+  ; CHECK-NEXT:   renamable $p0 = C2_cmpeqi renamable $r23, 0
+  ; CHECK-NEXT:   J2_jumpf killed renamable $p0, %bb.1, implicit-def dead $pc
+  ; CHECK-NEXT:   J2_jump %bb.3, implicit-def dead $pc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.for.end:
   bb.0.entry:
     successors: %bb.1.for.body
 
@@ -153,11 +210,6 @@ body:             |
     %13 = S2_asl_r_p_acc %13, %47, %8.isub_lo
     %51 = A2_tfrpi 0
 
-    ; CHECK: $d2 = S2_extractup undef renamable $d0, 6, 25
-    ; CHECK: $d0 = A2_tfrpi 2
-    ; CHECK: $d13 = A2_tfrpi -1
-    ; CHECK-NOT: undef $r4
-
   bb.1.for.body:
     successors: %bb.3.for.end, %bb.2.if.end82
 
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
index ac9641ff35b0cb..ad885fb1626a6b 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll
@@ -44,8 +44,13 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 {
 ; CHECK-NEXT:  .LBB0_4:
 ; CHECK-NEXT:    # implicit-def: $r5
 ; CHECK-NEXT:  .LBB0_5: # %bb16
-; CHECK-NEXT:    crnot 4*cr1+lt, eq
 ; CHECK-NEXT:    crnot 4*cr5+un, 4*cr5+eq
+; CHECK-NEXT:    mfocrf r4, 4
+; CHECK-NEXT:    mfcr r12
+; CHECK-NEXT:    crnot 4*cr1+lt, eq
+; CHECK-NEXT:    rotlwi r4, r4, 20
+; CHECK-NEXT:    stw r12, 8(r1)
+; CHECK-NEXT:    stw r4, -4(r1)
 ; CHECK-NEXT:    bc 12, 4*cr5+eq, .LBB0_7
 ; CHECK-NEXT:  # %bb.6: # %bb18
 ; CHECK-NEXT:    lwz r4, 0(r3)
@@ -53,15 +58,13 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 {
 ; CHECK-NEXT:  .LBB0_7:
 ; CHECK-NEXT:    # implicit-def: $r4
 ; CHECK-NEXT:  .LBB0_8: # %bb20
-; CHECK-NEXT:    mfcr r12
 ; CHECK-NEXT:    cmpwi cr2, r3, -1
 ; CHECK-NEXT:    cmpwi cr3, r4, -1
-; CHECK-NEXT:    stw r12, 8(r1)
 ; CHECK-NEXT:    cmpwi cr7, r3, 0
 ; CHECK-NEXT:    cmpwi cr6, r4, 0
+; CHECK-NEXT:    # implicit-def: $x3
 ; CHECK-NEXT:    crand 4*cr5+gt, 4*cr2+gt, 4*cr1+lt
 ; CHECK-NEXT:    crand 4*cr5+lt, 4*cr3+gt, 4*cr5+un
-; CHECK-NEXT:    # implicit-def: $x3
 ; CHECK-NEXT:    bc 4, 4*cr5+gt, .LBB0_10
 ; CHECK-NEXT:  # %bb.9: # %bb34
 ; CHECK-NEXT:    ld r3, 0(r3)
@@ -87,8 +90,8 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 {
 ; CHECK-NEXT:  # %bb.15: # %bb52
 ; CHECK-NEXT:    lwz r5, 0(r3)
 ; CHECK-NEXT:  .LBB0_16: # %bb54
-; CHECK-NEXT:    mfocrf r7, 128
-; CHECK-NEXT:    stw r7, -4(r1)
+; CHECK-NEXT:    crmove 4*cr5+eq, 4*cr1+eq
+; CHECK-NEXT:    crmove 4*cr1+eq, eq
 ; CHECK-NEXT:    # implicit-def: $r7
 ; CHECK-NEXT:    bc 4, 4*cr7+lt, .LBB0_18
 ; CHECK-NEXT:  # %bb.17: # %bb56
@@ -131,12 +134,12 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 {
 ; CHECK-NEXT:    isel r6, 0, r6, 4*cr6+gt
 ; CHECK-NEXT:    mtocrf 128, r9
 ; CHECK-NEXT:    mtfprd f0, r5
-; CHECK-NEXT:    isel r4, 0, r4, 4*cr5+eq
+; CHECK-NEXT:    isel r3, 0, r3, 4*cr1+eq
 ; CHECK-NEXT:    mtocrf 32, r12
 ; CHECK-NEXT:    mtocrf 16, r12
 ; CHECK-NEXT:    mtocrf 8, r12
-; CHECK-NEXT:    iseleq r3, 0, r3
-; CHECK-NEXT:    isel r6, 0, r6, 4*cr1+eq
+; CHECK-NEXT:    isel r6, 0, r6, 4*cr5+eq
+; CHECK-NEXT:    iseleq r4, 0, r4
 ; CHECK-NEXT:    xscvsxddp f0, f0
 ; CHECK-NEXT:    add r3, r6, r3
 ; CHECK-NEXT:    add r3, r4, r3
diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll b/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
index f4e49d8b96cf8e..d2e3dc9f8c4302 100644
--- a/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
+++ b/llvm/test/CodeGen/PowerPC/p10-spill-crgt.ll
@@ -33,38 +33,38 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-NEXT:    lwz r3, 0(r3)
 ; CHECK-NEXT:    std r29, 40(r1) # 8-byte Folded Spill
 ; CHECK-NEXT:    std r30, 48(r1) # 8-byte Folded Spill
-; CHECK-NEXT:    crxor 4*cr2+eq, 4*cr2+eq, 4*cr2+eq
+; CHECK-NEXT:    crxor 4*cr3+gt, 4*cr3+gt, 4*cr3+gt
 ; CHECK-NEXT:    paddi r29, 0, .LJTI0_0 at PCREL, 1
 ; CHECK-NEXT:    srwi r4, r3, 4
 ; CHECK-NEXT:    srwi r3, r3, 5
 ; CHECK-NEXT:    andi. r4, r4, 1
 ; CHECK-NEXT:    li r4, 0
-; CHECK-NEXT:    crmove 4*cr2+gt, gt
+; CHECK-NEXT:    crmove 4*cr3+lt, gt
 ; CHECK-NEXT:    andi. r3, r3, 1
 ; CHECK-NEXT:    li r3, 0
-; CHECK-NEXT:    crmove 4*cr2+lt, gt
+; CHECK-NEXT:    crmove 4*cr3+eq, gt
 ; CHECK-NEXT:    sldi r30, r3, 2
 ; CHECK-NEXT:    b .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: # %bb43
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    bl call_1 at notoc
-; CHECK-NEXT:    setnbc r3, 4*cr3+eq
+; CHECK-NEXT:    setnbc r3, 4*cr2+eq
 ; CHECK-NEXT:    li r4, 0
 ; CHECK-NEXT:    stb r4, 0(r3)
 ; CHECK-NEXT:    li r4, 0
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_2: # %bb5
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    bc 12, 4*cr2+gt, .LBB0_31
+; CHECK-NEXT:    bc 12, 4*cr3+lt, .LBB0_31
 ; CHECK-NEXT:  # %bb.3: # %bb10
 ; CHECK-NEXT:    #
-; CHECK-NEXT:    bc 12, 4*cr2+eq, .LBB0_5
+; CHECK-NEXT:    bc 12, 4*cr3+gt, .LBB0_5
 ; CHECK-NEXT:  # %bb.4: # %bb10
 ; CHECK-NEXT:    #
 ; CHECK-NEXT:    mr r3, r4
 ; CHECK-NEXT:    lwz r5, 0(r3)
 ; CHECK-NEXT:    rlwinm r4, r5, 0, 21, 22
-; CHECK-NEXT:    cmpwi cr3, r4, 512
+; CHECK-NEXT:    cmpwi cr2, r4, 512
 ; CHECK-NEXT:    lwax r4, r30, r29
 ; CHECK-NEXT:    add r4, r4, r29
 ; CHECK-NEXT:    mtctr r4
@@ -186,11 +186,11 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-NEXT:    mtocrf 8, r12
 ; CHECK-NEXT:    blr
 ; CHECK-NEXT:  .LBB0_32: # %bb29
-; CHECK-NEXT:    crmove eq, 4*cr3+eq
+; CHECK-NEXT:    crmove 4*cr2+lt, 4*cr3+eq
 ; CHECK-NEXT:    cmpwi cr3, r5, 366
 ; CHECK-NEXT:    cmpwi cr4, r3, 0
+; CHECK-NEXT:    setnbc r30, 4*cr2+eq
 ; CHECK-NEXT:    li r29, 0
-; CHECK-NEXT:    setnbc r30, eq
 ; CHECK-NEXT:    bc 12, 4*cr2+lt, .LBB0_36
 ; CHECK-NEXT:    .p2align 5
 ; CHECK-NEXT:  .LBB0_33: # %bb36
@@ -224,15 +224,15 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-BE-NEXT:    lwz r3, 0(r3)
 ; CHECK-BE-NEXT:    std r29, 120(r1) # 8-byte Folded Spill
 ; CHECK-BE-NEXT:    std r30, 128(r1) # 8-byte Folded Spill
-; CHECK-BE-NEXT:    crxor 4*cr2+eq, 4*cr2+eq, 4*cr2+eq
+; CHECK-BE-NEXT:    crxor 4*cr3+gt, 4*cr3+gt, 4*cr3+gt
 ; CHECK-BE-NEXT:    srwi r4, r3, 4
 ; CHECK-BE-NEXT:    srwi r3, r3, 5
 ; CHECK-BE-NEXT:    andi. r4, r4, 1
 ; CHECK-BE-NEXT:    li r4, 0
-; CHECK-BE-NEXT:    crmove 4*cr2+gt, gt
+; CHECK-BE-NEXT:    crmove 4*cr3+lt, gt
 ; CHECK-BE-NEXT:    andi. r3, r3, 1
 ; CHECK-BE-NEXT:    li r3, 0
-; CHECK-BE-NEXT:    crmove 4*cr2+lt, gt
+; CHECK-BE-NEXT:    crmove 4*cr3+eq, gt
 ; CHECK-BE-NEXT:    sldi r30, r3, 2
 ; CHECK-BE-NEXT:    addis r3, r2, .LC0 at toc@ha
 ; CHECK-BE-NEXT:    ld r29, .LC0 at toc@l(r3)
@@ -241,23 +241,23 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    bl call_1
 ; CHECK-BE-NEXT:    nop
-; CHECK-BE-NEXT:    setnbc r3, 4*cr3+eq
+; CHECK-BE-NEXT:    setnbc r3, 4*cr2+eq
 ; CHECK-BE-NEXT:    li r4, 0
 ; CHECK-BE-NEXT:    stb r4, 0(r3)
 ; CHECK-BE-NEXT:    li r4, 0
 ; CHECK-BE-NEXT:    .p2align 4
 ; CHECK-BE-NEXT:  .LBB0_2: # %bb5
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    bc 12, 4*cr2+gt, .LBB0_31
+; CHECK-BE-NEXT:    bc 12, 4*cr3+lt, .LBB0_31
 ; CHECK-BE-NEXT:  # %bb.3: # %bb10
 ; CHECK-BE-NEXT:    #
-; CHECK-BE-NEXT:    bc 12, 4*cr2+eq, .LBB0_5
+; CHECK-BE-NEXT:    bc 12, 4*cr3+gt, .LBB0_5
 ; CHECK-BE-NEXT:  # %bb.4: # %bb10
 ; CHECK-BE-NEXT:    #
 ; CHECK-BE-NEXT:    mr r3, r4
 ; CHECK-BE-NEXT:    lwz r5, 0(r3)
 ; CHECK-BE-NEXT:    rlwinm r4, r5, 0, 21, 22
-; CHECK-BE-NEXT:    cmpwi cr3, r4, 512
+; CHECK-BE-NEXT:    cmpwi cr2, r4, 512
 ; CHECK-BE-NEXT:    lwax r4, r30, r29
 ; CHECK-BE-NEXT:    add r4, r4, r29
 ; CHECK-BE-NEXT:    mtctr r4
@@ -379,11 +379,11 @@ define dso_local fastcc void @P10_Spill_CR_GT() unnamed_addr {
 ; CHECK-BE-NEXT:    mtocrf 8, r12
 ; CHECK-BE-NEXT:    blr
 ; CHECK-BE-NEXT:  .LBB0_32: # %bb29
-; CHECK-BE-NEXT:    crmove eq, 4*cr3+eq
+; CHECK-BE-NEXT:    crmove 4*cr2+lt, 4*cr3+eq
 ; CHECK-BE-NEXT:    cmpwi cr3, r5, 366
 ; CHECK-BE-NEXT:    cmpwi cr4, r3, 0
+; CHECK-BE-NEXT:    setnbc r30, 4*cr2+eq
 ; CHECK-BE-NEXT:    li r29, 0
-; CHECK-BE-NEXT:    setnbc r30, eq
 ; CHECK-BE-NEXT:    bc 12, 4*cr2+lt, .LBB0_36
 ; CHECK-BE-NEXT:    .p2align 4
 ; CHECK-BE-NEXT:  .LBB0_33: # %bb36
diff --git a/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir b/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir
index 009fd6ce82679b..5604acbfc94ccb 100644
--- a/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir
+++ b/llvm/test/CodeGen/SystemZ/cond-move-regalloc-hints.mir
@@ -2,7 +2,7 @@
 # RUN:   | FileCheck %s
 #
 # Test that the reg alloc hints are given in a good order that gives no more
-# than 5 LGRs in output.
+# than 6 LGRs in output.
 
 --- |
   ; ModuleID = 'tc.ll'
@@ -113,6 +113,7 @@
 # CHECK: lgr
 # CHECK: lgr
 # CHECK: lgr
+# CHECK: lgr
 # CHECK-NOT: lgr
 
 ---
diff --git a/llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll b/llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll
index d0bfe74719f89b..7b68a781948ee1 100644
--- a/llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll
+++ b/llvm/test/CodeGen/SystemZ/fmuladd-soft-float.ll
@@ -88,7 +88,8 @@ define double @fmuladd_contract_f64(double %a, double %b, double %c) #0 {
 define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
 ; SOFT-FLOAT-LABEL: fmuladd_contract_v4f32:
 ; SOFT-FLOAT:       # %bb.0:
-; SOFT-FLOAT-NEXT:    stmg %r7, %r15, 56(%r15)
+; SOFT-FLOAT-NEXT:    stmg %r6, %r15, 48(%r15)
+; SOFT-FLOAT-NEXT:    .cfi_offset %r6, -112
 ; SOFT-FLOAT-NEXT:    .cfi_offset %r7, -104
 ; SOFT-FLOAT-NEXT:    .cfi_offset %r8, -96
 ; SOFT-FLOAT-NEXT:    .cfi_offset %r9, -88
@@ -102,17 +103,17 @@ define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x
 ; SOFT-FLOAT-NEXT:    .cfi_def_cfa_offset 336
 ; SOFT-FLOAT-NEXT:    llgf %r0, 388(%r15)
 ; SOFT-FLOAT-NEXT:    stg %r0, 168(%r15) # 8-byte Folded Spill
-; SOFT-FLOAT-NEXT:    llgf %r0, 380(%r15)
-; SOFT-FLOAT-NEXT:    stg %r0, 160(%r15) # 8-byte Folded Spill
+; SOFT-FLOAT-NEXT:    llgf %r12, 380(%r15)
 ; SOFT-FLOAT-NEXT:    llgf %r11, 372(%r15)
 ; SOFT-FLOAT-NEXT:    llgf %r10, 364(%r15)
 ; SOFT-FLOAT-NEXT:    llgf %r8, 340(%r15)
 ; SOFT-FLOAT-NEXT:    llgf %r0, 356(%r15)
 ; SOFT-FLOAT-NEXT:    llgf %r7, 348(%r15)
 ; SOFT-FLOAT-NEXT:    llgfr %r1, %r5
+; SOFT-FLOAT-NEXT:    st %r6, 164(%r15) # 4-byte Folded Spill
 ; SOFT-FLOAT-NEXT:    lr %r9, %r4
 ; SOFT-FLOAT-NEXT:    lr %r13, %r3
-; SOFT-FLOAT-NEXT:    lr %r12, %r2
+; SOFT-FLOAT-NEXT:    lr %r6, %r2
 ; SOFT-FLOAT-NEXT:    lgr %r2, %r1
 ; SOFT-FLOAT-NEXT:    lgr %r3, %r0
 ; SOFT-FLOAT-NEXT:    brasl %r14, __mulsf3 at PLT
@@ -126,20 +127,20 @@ define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x
 ; SOFT-FLOAT-NEXT:    lgr %r2, %r0
 ; SOFT-FLOAT-NEXT:    lgr %r3, %r8
 ; SOFT-FLOAT-NEXT:    brasl %r14, __mulsf3 at PLT
-; SOFT-FLOAT-NEXT:    llgfr %r0, %r12
-; SOFT-FLOAT-NEXT:    llgfr %r3, %r6
-; SOFT-FLOAT-NEXT:    lgr %r12, %r2
+; SOFT-FLOAT-NEXT:    llgfr %r0, %r6
+; SOFT-FLOAT-NEXT:    llgf %r3, 164(%r15) # 4-byte Folded Reload
+; SOFT-FLOAT-NEXT:    lgr %r8, %r2
 ; SOFT-FLOAT-NEXT:    lgr %r2, %r0
 ; SOFT-FLOAT-NEXT:    brasl %r14, __mulsf3 at PLT
 ; SOFT-FLOAT-NEXT:    lgr %r3, %r10
 ; SOFT-FLOAT-NEXT:    brasl %r14, __addsf3 at PLT
 ; SOFT-FLOAT-NEXT:    lgr %r10, %r2
-; SOFT-FLOAT-NEXT:    lgr %r2, %r12
+; SOFT-FLOAT-NEXT:    lgr %r2, %r8
 ; SOFT-FLOAT-NEXT:    lgr %r3, %r11
 ; SOFT-FLOAT-NEXT:    brasl %r14, __addsf3 at PLT
-; SOFT-FLOAT-NEXT:    lgr %r12, %r2
+; SOFT-FLOAT-NEXT:    lgr %r11, %r2
 ; SOFT-FLOAT-NEXT:    lgr %r2, %r13
-; SOFT-FLOAT-NEXT:    lg %r3, 160(%r15) # 8-byte Folded Reload
+; SOFT-FLOAT-NEXT:    lgr %r3, %r12
 ; SOFT-FLOAT-NEXT:    brasl %r14, __addsf3 at PLT
 ; SOFT-FLOAT-NEXT:    lgr %r13, %r2
 ; SOFT-FLOAT-NEXT:    lgr %r2, %r9
@@ -147,10 +148,10 @@ define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x
 ; SOFT-FLOAT-NEXT:    brasl %r14, __addsf3 at PLT
 ; SOFT-FLOAT-NEXT:    lgr %r5, %r2
 ; SOFT-FLOAT-NEXT:    lr %r2, %r10
-; SOFT-FLOAT-NEXT:    lr %r3, %r12
+; SOFT-FLOAT-NEXT:    lr %r3, %r11
 ; SOFT-FLOAT-NEXT:    lr %r4, %r13
 ; SOFT-FLOAT-NEXT:    # kill: def $r5l killed $r5l killed $r5d
-; SOFT-FLOAT-NEXT:    lmg %r7, %r15, 232(%r15)
+; SOFT-FLOAT-NEXT:    lmg %r6, %r15, 224(%r15)
 ; SOFT-FLOAT-NEXT:    br %r14
   %product = fmul contract <4 x float> %a, %b
   %result = fadd contract <4 x float> %product, %c
diff --git a/llvm/test/CodeGen/SystemZ/int-conv-03.ll b/llvm/test/CodeGen/SystemZ/int-conv-03.ll
index 41f2f87186a5ef..38cc37e853b8c2 100644
--- a/llvm/test/CodeGen/SystemZ/int-conv-03.ll
+++ b/llvm/test/CodeGen/SystemZ/int-conv-03.ll
@@ -108,7 +108,7 @@ define i64 @f9(i64 %src, i64 %index) {
 ; to use LGB if possible.
 define void @f10(ptr %ptr) {
 ; CHECK-LABEL: f10:
-; CHECK: lgb {{%r[0-9]+}}, 199(%r15)
+; CHECK: lgb {{%r[0-9]+}}, 183(%r15)
 ; CHECK: br %r14
   %val0 = load volatile i64, ptr %ptr
   %val1 = load volatile i64, ptr %ptr
diff --git a/llvm/test/CodeGen/SystemZ/int-conv-04.ll b/llvm/test/CodeGen/SystemZ/int-conv-04.ll
index 5c808920ff25e7..c35cebd77ecb27 100644
--- a/llvm/test/CodeGen/SystemZ/int-conv-04.ll
+++ b/llvm/test/CodeGen/SystemZ/int-conv-04.ll
@@ -117,7 +117,7 @@ define i64 @f10(i64 %src, i64 %index) {
 ; to use LLGC if possible.
 define void @f11(ptr %ptr) {
 ; CHECK-LABEL: f11:
-; CHECK: llgc {{%r[0-9]+}}, 199(%r15)
+; CHECK: llgc {{%r[0-9]+}}, 183(%r15)
 ; CHECK: br %r14
   %val0 = load volatile i64, ptr %ptr
   %val1 = load volatile i64, ptr %ptr
diff --git a/llvm/test/CodeGen/SystemZ/int-conv-07.ll b/llvm/test/CodeGen/SystemZ/int-conv-07.ll
index bc2895da2cde0a..69de6ffc261191 100644
--- a/llvm/test/CodeGen/SystemZ/int-conv-07.ll
+++ b/llvm/test/CodeGen/SystemZ/int-conv-07.ll
@@ -108,7 +108,7 @@ define i64 @f9(i64 %src, i64 %index) {
 ; to use LGH if possible.
 define void @f10(ptr %ptr) {
 ; CHECK-LABEL: f10:
-; CHECK: lgh {{%r[0-9]+}}, 198(%r15)
+; CHECK: lgh {{%r[0-9]+}}, 182(%r15)
 ; CHECK: br %r14
   %val0 = load volatile i64, ptr %ptr
   %val1 = load volatile i64, ptr %ptr
diff --git a/llvm/test/CodeGen/SystemZ/int-conv-08.ll b/llvm/test/CodeGen/SystemZ/int-conv-08.ll
index 82f2bcea4af780..aa43f80225fa45 100644
--- a/llvm/test/CodeGen/SystemZ/int-conv-08.ll
+++ b/llvm/test/CodeGen/SystemZ/int-conv-08.ll
@@ -117,7 +117,7 @@ define i64 @f10(i64 %src, i64 %index) {
 ; to use LLGH if possible.
 define void @f11(ptr %ptr) {
 ; CHECK-LABEL: f11:
-; CHECK: llgh {{%r[0-9]+}}, 198(%r15)
+; CHECK: llgh {{%r[0-9]+}}, 182(%r15)
 ; CHECK: br %r14
   %val0 = load volatile i64, ptr %ptr
   %val1 = load volatile i64, ptr %ptr
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
index c5f61b7fcdde55..431f5f7455dd35 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
@@ -231,7 +231,7 @@ define arm_aapcs_vfpcc <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b,
 ; CHECK-NEXT:    vmov.f32 s4, s1
 ; CHECK-NEXT:    vmov.f32 s24, s9
 ; CHECK-NEXT:    vmov.f32 s16, s12
-; CHECK-NEXT:    vstrw.32 q6, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q6, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s12, s8
 ; CHECK-NEXT:    vldr s27, [sp, #184]
 ; CHECK-NEXT:    vmov.f32 s17, s14
@@ -244,32 +244,33 @@ define arm_aapcs_vfpcc <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b,
 ; CHECK-NEXT:    vneg.f32 q0, q0
 ; CHECK-NEXT:    vldr s24, [sp, #160]
 ; CHECK-NEXT:    vfma.f32 q1, q5, q2
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vsub.f32 q6, q6, q1
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vldr s13, [sp, #156]
-; CHECK-NEXT:    vfma.f32 q1, q4, q2
 ; CHECK-NEXT:    vldr s12, [sp, #148]
+; CHECK-NEXT:    vfma.f32 q1, q4, q2
 ; CHECK-NEXT:    vadd.f32 q1, q7, q1
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vldr s1, [sp, #152]
-; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmul.f32 q2, q3, q7
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vldr s0, [sp, #144]
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vneg.f32 q2, q2
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vldr s21, [sp, #200]
-; CHECK-NEXT:    vfma.f32 q2, q0, q3
-; CHECK-NEXT:    vmul.f32 q0, q0, q7
-; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmul.f32 q2, q2, q7
 ; CHECK-NEXT:    vldr s20, [sp, #192]
+; CHECK-NEXT:    vneg.f32 q2, q2
 ; CHECK-NEXT:    vldr s17, [sp, #204]
+; CHECK-NEXT:    vfma.f32 q2, q0, q3
+; CHECK-NEXT:    vmul.f32 q0, q0, q7
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vldr s16, [sp, #196]
 ; CHECK-NEXT:    vfma.f32 q0, q7, q3
+; CHECK-NEXT:    vadd.f32 q4, q4, q2
 ; CHECK-NEXT:    vsub.f32 q3, q5, q0
 ; CHECK-NEXT:    vmov.f32 s1, s4
-; CHECK-NEXT:    vadd.f32 q4, q4, q2
 ; CHECK-NEXT:    vmov.f32 s3, s5
 ; CHECK-NEXT:    vmov.f32 s5, s6
 ; CHECK-NEXT:    vmov.f32 s0, s24
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 85317e1fe4626a..64bd23f9edc533 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -139,105 +139,104 @@ define void @vst3_v16i32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #160
-; CHECK-NEXT:    sub sp, #160
+; CHECK-NEXT:    .pad #144
+; CHECK-NEXT:    sub sp, #144
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #64]
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #128]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #144] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
 ; CHECK-NEXT:    vmov r12, r3, d10
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #176]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #112] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
 ; CHECK-NEXT:    vmov.f32 s8, s1
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
 ; CHECK-NEXT:    vmov.f32 s10, s6
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #112]
+; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s11, s2
-; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #112]
+; CHECK-NEXT:    vmov.f32 s20, s22
 ; CHECK-NEXT:    vmov.32 q2[1], r3
-; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s22, s3
+; CHECK-NEXT:    vstrw.32 q3, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s21, s7
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s20, s22
-; CHECK-NEXT:    vmov.f32 s22, s3
-; CHECK-NEXT:    vstrw.32 q7, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s9, s0
+; CHECK-NEXT:    vmov.f32 s11, s5
 ; CHECK-NEXT:    vmov.f32 s0, s30
+; CHECK-NEXT:    vstrw.32 q2, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s3, s31
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s1, s15
 ; CHECK-NEXT:    vmov.f32 s2, s19
-; CHECK-NEXT:    vmov.f32 s3, s31
-; CHECK-NEXT:    vstrw.32 q0, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s8, s4
-; CHECK-NEXT:    vmov.f32 s11, s5
+; CHECK-NEXT:    vmov.f32 s20, s22
+; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s5, s16
+; CHECK-NEXT:    vmov.f32 s7, s13
+; CHECK-NEXT:    vmov.f32 s22, s31
+; CHECK-NEXT:    vmov.f32 s16, s29
+; CHECK-NEXT:    vmov.f32 s19, s30
+; CHECK-NEXT:    vmov.f32 s13, s28
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s3, s18
+; CHECK-NEXT:    vmov.f32 s21, s11
+; CHECK-NEXT:    vmov.f32 s4, s12
+; CHECK-NEXT:    vstrw.32 q5, [r1, #128]
+; CHECK-NEXT:    vmov.f32 s18, s10
+; CHECK-NEXT:    vmov.f32 s12, s8
+; CHECK-NEXT:    vmov.f32 s15, s9
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s9, s28
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #112] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s0, s17
-; CHECK-NEXT:    vstrw.32 q2, [sp, #128] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s2, s14
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s3, s18
-; CHECK-NEXT:    vmov.f32 s21, s7
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s21, s7
-; CHECK-NEXT:    vmov.f32 s20, s2
-; CHECK-NEXT:    vmov.f32 s23, s3
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s22, s11
-; CHECK-NEXT:    vstrw.32 q5, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s21, s16
-; CHECK-NEXT:    vmov.f32 s23, s13
-; CHECK-NEXT:    vmov.f32 s16, s9
-; CHECK-NEXT:    vmov.f32 s19, s10
-; CHECK-NEXT:    vmov.f32 s13, s8
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s18, s6
-; CHECK-NEXT:    vmov.f64 d14, d4
-; CHECK-NEXT:    vmov.f32 s15, s5
-; CHECK-NEXT:    vmov.f32 s5, s27
-; CHECK-NEXT:    vmov.f32 s8, s24
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov.f32 s9, s0
-; CHECK-NEXT:    vmov.f32 s24, s1
-; CHECK-NEXT:    vmov.f32 s27, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
 ; CHECK-NEXT:    vmov r0, r3, d14
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    vmov.f32 s2, s11
+; CHECK-NEXT:    vmov.f32 s1, s27
+; CHECK-NEXT:    vmov.f32 s8, s24
+; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
 ; CHECK-NEXT:    vmov.f32 s11, s25
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s20, s12
-; CHECK-NEXT:    vmov.32 q6[1], r3
-; CHECK-NEXT:    vmov.f32 s12, s4
-; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
-; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.32 q2[2], r0
 ; CHECK-NEXT:    vmov r0, lr, d14
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.32 q0[1], lr
-; CHECK-NEXT:    vmov.32 q5[2], r0
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
 ; CHECK-NEXT:    vmov r2, r4, d14
-; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #128] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s24, s29
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #128] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.32 q3[2], r2
 ; CHECK-NEXT:    vmov.32 q4[1], r4
+; CHECK-NEXT:    vmov.f32 s27, s30
+; CHECK-NEXT:    vmov.32 q1[2], r0
+; CHECK-NEXT:    vmov.32 q6[1], r3
 ; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vstrw.32 q1, [r1, #80]
+; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
+; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #96]
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
-; CHECK-NEXT:    vstrw.32 q5, [r1, #144]
+; CHECK-NEXT:    vstrw.32 q1, [r1, #144]
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #160
+; CHECK-NEXT:    add sp, #144
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r4, pc}
 entry:
@@ -413,23 +412,21 @@ define void @vst3_v16i16(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vmov.16 q3[0], r2
 ; CHECK-NEXT:    vins.f16 s0, s7
 ; CHECK-NEXT:    vmov.f32 s2, s11
-; CHECK-NEXT:    vmov.u16 r2, q1[7]
-; CHECK-NEXT:    vmov.f64 d12, d4
-; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s26, s10
+; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
 ; CHECK-NEXT:    vmov.f32 s13, s0
-; CHECK-NEXT:    vstrw.32 q6, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.u16 r2, q1[7]
+; CHECK-NEXT:    vmov q6, q1
 ; CHECK-NEXT:    vmov.16 q3[6], r2
 ; CHECK-NEXT:    vmovx.f16 s0, s10
 ; CHECK-NEXT:    vins.f16 s12, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s2
 ; CHECK-NEXT:    vmov.f32 s14, s11
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
 ; CHECK-NEXT:    vins.f16 s14, s0
-; CHECK-NEXT:    vmov.f32 s20, s7
 ; CHECK-NEXT:    vmov q0, q3
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vmov.f32 s20, s7
 ; CHECK-NEXT:    vmov.u16 r2, q3[5]
 ; CHECK-NEXT:    vins.f16 s20, s15
 ; CHECK-NEXT:    vmov.16 q4[0], r2
@@ -443,25 +440,28 @@ define void @vst3_v16i16(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vmovx.f16 s7, s18
 ; CHECK-NEXT:    vins.f16 s31, s7
 ; CHECK-NEXT:    vmovx.f16 s7, s11
-; CHECK-NEXT:    vins.f16 s3, s7
 ; CHECK-NEXT:    vins.f16 s19, s20
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s20, s24
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vins.f16 s3, s7
 ; CHECK-NEXT:    vmovx.f16 s11, s8
-; CHECK-NEXT:    vmov.f32 s7, s25
-; CHECK-NEXT:    vins.f16 s20, s0
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov q0, q6
+; CHECK-NEXT:    vins.f16 s20, s24
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.u16 r0, q0[1]
-; CHECK-NEXT:    vins.f16 s7, s1
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s7, s25
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.16 q5[4], r0
-; CHECK-NEXT:    vmov.u16 r0, q3[1]
+; CHECK-NEXT:    vins.f16 s7, s1
 ; CHECK-NEXT:    vmov.f32 s23, s7
 ; CHECK-NEXT:    vmovx.f16 s7, s24
 ; CHECK-NEXT:    vmov.f32 s24, s4
 ; CHECK-NEXT:    vins.f16 s8, s7
-; CHECK-NEXT:    vins.f16 s24, s12
 ; CHECK-NEXT:    vmov.f32 s21, s8
+; CHECK-NEXT:    vins.f16 s24, s12
 ; CHECK-NEXT:    vmov.f32 s8, s5
+; CHECK-NEXT:    vmov.u16 r0, q3[1]
 ; CHECK-NEXT:    vmov.16 q6[4], r0
 ; CHECK-NEXT:    vins.f16 s8, s13
 ; CHECK-NEXT:    vmovx.f16 s4, s4
@@ -470,26 +470,26 @@ define void @vst3_v16i16(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vins.f16 s28, s4
 ; CHECK-NEXT:    vmov.f32 s4, s6
 ; CHECK-NEXT:    vmov.u16 r0, q3[3]
-; CHECK-NEXT:    vins.f16 s4, s14
 ; CHECK-NEXT:    vmov.16 q0[2], r0
-; CHECK-NEXT:    vins.f16 s26, s8
+; CHECK-NEXT:    vins.f16 s4, s14
 ; CHECK-NEXT:    vmov.f32 s2, s4
 ; CHECK-NEXT:    vmovx.f16 s4, s29
 ; CHECK-NEXT:    vins.f16 s1, s4
 ; CHECK-NEXT:    vmovx.f16 s4, s6
 ; CHECK-NEXT:    vmovx.f16 s0, s5
 ; CHECK-NEXT:    vins.f16 s30, s4
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vins.f16 s29, s0
+; CHECK-NEXT:    vins.f16 s26, s8
 ; CHECK-NEXT:    vmov.f32 s0, s29
-; CHECK-NEXT:    vins.f16 s22, s11
 ; CHECK-NEXT:    vmov.f32 s3, s30
-; CHECK-NEXT:    vstrw.32 q5, [r1]
+; CHECK-NEXT:    vins.f16 s22, s11
 ; CHECK-NEXT:    vmov.f32 s29, s5
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #64]
 ; CHECK-NEXT:    vmov.f32 s30, s6
+; CHECK-NEXT:    vstrw.32 q5, [r1]
 ; CHECK-NEXT:    vmov.f32 s8, s6
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s18, s31
 ; CHECK-NEXT:    vmov.u16 r0, q1[3]
 ; CHECK-NEXT:    vins.f16 s8, s6
@@ -506,7 +506,7 @@ define void @vst3_v16i16(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vmov.f32 s7, s10
 ; CHECK-NEXT:    vstrw.32 q6, [r1, #48]
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #80]
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #32]
 ; CHECK-NEXT:    add sp, #48
@@ -1012,35 +1012,36 @@ define void @vst3_v8f32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    .pad #32
 ; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s0, s2
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s1, s15
-; CHECK-NEXT:    vmov.f32 s2, s11
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s1, s27
+; CHECK-NEXT:    vmov.f32 s2, s15
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s0, s12
-; CHECK-NEXT:    vmov.f32 s1, s8
-; CHECK-NEXT:    vmov.f32 s3, s13
-; CHECK-NEXT:    vmov.f32 s2, s24
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #64]
+; CHECK-NEXT:    vmov.f32 s0, s24
+; CHECK-NEXT:    vmov.f32 s2, s8
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s3, s25
+; CHECK-NEXT:    vmov.f32 s1, s12
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #48]
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s20, s4
 ; CHECK-NEXT:    vmov.f32 s23, s5
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
-; CHECK-NEXT:    vmov.f32 s12, s9
-; CHECK-NEXT:    vmov.f32 s15, s10
-; CHECK-NEXT:    vmov.f32 s13, s25
+; CHECK-NEXT:    vmov.f32 s25, s9
 ; CHECK-NEXT:    vmov.f32 s9, s7
-; CHECK-NEXT:    vstrw.32 q3, [r1, #64]
 ; CHECK-NEXT:    vmov.f32 s21, s16
 ; CHECK-NEXT:    vmov.f32 s22, s28
-; CHECK-NEXT:    vmov.f32 s8, s30
+; CHECK-NEXT:    vmov.f32 s24, s13
 ; CHECK-NEXT:    vstrw.32 q5, [r1]
+; CHECK-NEXT:    vmov.f32 s27, s14
+; CHECK-NEXT:    vmov.f32 s8, s30
+; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
 ; CHECK-NEXT:    vmov.f32 s10, s19
 ; CHECK-NEXT:    vmov.f32 s11, s31
 ; CHECK-NEXT:    vmov.f32 s5, s29
@@ -1069,98 +1070,100 @@ define void @vst3_v16f32(ptr %src, ptr %dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #128
-; CHECK-NEXT:    sub sp, #128
+; CHECK-NEXT:    .pad #112
+; CHECK-NEXT:    sub sp, #112
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #176]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #96]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
+; CHECK-NEXT:    vstrw.32 q4, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
 ; CHECK-NEXT:    vmov.f32 s24, s9
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #144]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s26, s6
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #112]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
+; CHECK-NEXT:    vstrw.32 q4, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s27, s10
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
+; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s25, s1
-; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #16]
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #144]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
 ; CHECK-NEXT:    vmov.f32 s24, s2
-; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q5, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s27, s3
-; CHECK-NEXT:    vmov.f32 s14, s0
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s12, s4
-; CHECK-NEXT:    vmov.f32 s15, s5
-; CHECK-NEXT:    vmov.f32 s13, s8
-; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q5, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s25, s7
-; CHECK-NEXT:    vmov.f32 s6, s0
-; CHECK-NEXT:    vmov.f32 s13, s1
-; CHECK-NEXT:    vmov.f32 s0, s2
-; CHECK-NEXT:    vmov.f32 s4, s16
-; CHECK-NEXT:    vmov.f32 s5, s28
-; CHECK-NEXT:    vmov.f32 s7, s17
-; CHECK-NEXT:    vmov.f32 s1, s19
-; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s2, s31
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s26, s11
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s15, s30
 ; CHECK-NEXT:    vstrw.32 q6, [r1, #32]
-; CHECK-NEXT:    vmov.f32 s17, s1
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s30, s0
-; CHECK-NEXT:    vmov.f32 s0, s2
-; CHECK-NEXT:    vmov.f32 s1, s11
-; CHECK-NEXT:    vmov.f32 s2, s7
-; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s18, s10
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s28, s8
-; CHECK-NEXT:    vmov.f32 s31, s9
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s12, s29
-; CHECK-NEXT:    vmov.f32 s29, s4
-; CHECK-NEXT:    vstrw.32 q3, [r1, #160]
-; CHECK-NEXT:    vmov.f32 s16, s5
+; CHECK-NEXT:    vmov.f32 s24, s4
+; CHECK-NEXT:    vmov.f32 s27, s5
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s25, s8
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s26, s0
+; CHECK-NEXT:    vstrw.32 q6, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s26, s4
+; CHECK-NEXT:    vmov.f32 s1, s5
+; CHECK-NEXT:    vmov.f32 s4, s6
+; CHECK-NEXT:    vmov.f32 s5, s19
+; CHECK-NEXT:    vmov.f32 s6, s31
+; CHECK-NEXT:    vstrw.32 q1, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s24, s16
+; CHECK-NEXT:    vmov.f32 s0, s29
+; CHECK-NEXT:    vmov.f32 s3, s30
+; CHECK-NEXT:    vmov.f32 s30, s4
+; CHECK-NEXT:    vmov.f32 s4, s6
+; CHECK-NEXT:    vmov.f32 s16, s9
+; CHECK-NEXT:    vmov.f32 s19, s10
+; CHECK-NEXT:    vmov.f32 s29, s8
+; CHECK-NEXT:    vmov.f32 s6, s11
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s2, s18
+; CHECK-NEXT:    vmov.f32 s25, s28
+; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
+; CHECK-NEXT:    vmov.f32 s27, s17
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s17, s5
+; CHECK-NEXT:    vstrw.32 q6, [r1, #144]
+; CHECK-NEXT:    vmov.f32 s18, s22
+; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
+; CHECK-NEXT:    vmov.f32 s5, s23
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s28, s20
+; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s31, s21
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s5, s8
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s4, s20
 ; CHECK-NEXT:    vstrw.32 q7, [r1, #96]
-; CHECK-NEXT:    vmov.f32 s19, s6
-; CHECK-NEXT:    vmov.f32 s4, s8
+; CHECK-NEXT:    vmov.f32 s7, s21
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
-; CHECK-NEXT:    vmov.f32 s6, s20
-; CHECK-NEXT:    vmov.f32 s20, s22
-; CHECK-NEXT:    vmov.f32 s5, s0
-; CHECK-NEXT:    vmov.f32 s8, s1
-; CHECK-NEXT:    vmov.f32 s11, s2
-; CHECK-NEXT:    vmov.f32 s22, s3
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s7, s9
-; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s9, s21
+; CHECK-NEXT:    vmov.f32 s10, s22
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s6, s12
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vmov.f32 s11, s22
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s8, s9
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s21, s27
+; CHECK-NEXT:    vmov.f32 s9, s13
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s13, s23
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s12, s14
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #64]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q5, [r1, #80]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    add sp, #128
+; CHECK-NEXT:    vmov.f32 s14, s23
+; CHECK-NEXT:    vstrw.32 q1, [r1, #128]
+; CHECK-NEXT:    vstrw.32 q3, [r1, #80]
+; CHECK-NEXT:    add sp, #112
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1334,7 +1337,7 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    sub sp, #48
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #32]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
 ; CHECK-NEXT:    vmov.f32 s8, s12
 ; CHECK-NEXT:    vmovx.f16 s2, s4
 ; CHECK-NEXT:    vmov.f32 s0, s13
@@ -1348,60 +1351,60 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vmov.f32 s12, s8
 ; CHECK-NEXT:    vmov.f64 d11, d9
 ; CHECK-NEXT:    vmov.f32 s21, s17
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
 ; CHECK-NEXT:    vmov.f64 d7, d5
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT:    vmov q6, q4
 ; CHECK-NEXT:    vmovx.f16 s2, s8
 ; CHECK-NEXT:    vins.f16 s8, s0
 ; CHECK-NEXT:    vins.f16 s14, s2
-; CHECK-NEXT:    vmovx.f16 s2, s24
+; CHECK-NEXT:    vmovx.f16 s2, s28
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0]
+; CHECK-NEXT:    vmov.f32 s12, s16
+; CHECK-NEXT:    vmov.f32 s0, s17
+; CHECK-NEXT:    vins.f16 s12, s28
 ; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vins.f16 s0, s29
+; CHECK-NEXT:    vmov.16 q3[4], r2
+; CHECK-NEXT:    vmov.f32 s4, s23
+; CHECK-NEXT:    vmov.f32 s15, s0
+; CHECK-NEXT:    vmovx.f16 s0, s24
 ; CHECK-NEXT:    vmov.f32 s16, s12
-; CHECK-NEXT:    vmov.f32 s0, s13
-; CHECK-NEXT:    vins.f16 s16, s24
-; CHECK-NEXT:    vmov.16 q4[4], r2
-; CHECK-NEXT:    vins.f16 s0, s25
-; CHECK-NEXT:    vmov.f32 s19, s0
-; CHECK-NEXT:    vmovx.f16 s0, s12
-; CHECK-NEXT:    vmov.f64 d15, d13
-; CHECK-NEXT:    vmov.f32 s17, s13
-; CHECK-NEXT:    vmov.f32 s24, s16
-; CHECK-NEXT:    vmov.f64 d13, d9
+; CHECK-NEXT:    vins.f16 s4, s7
 ; CHECK-NEXT:    vmov.f64 d9, d7
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #64]
 ; CHECK-NEXT:    vmovx.f16 s2, s12
 ; CHECK-NEXT:    vins.f16 s12, s0
-; CHECK-NEXT:    vins.f16 s26, s2
+; CHECK-NEXT:    vins.f16 s18, s2
 ; CHECK-NEXT:    vmovx.f16 s2, s30
-; CHECK-NEXT:    vmov.f32 s0, s19
-; CHECK-NEXT:    vstrw.32 q6, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s0, s27
+; CHECK-NEXT:    vstrw.32 q4, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vmov r0, s2
 ; CHECK-NEXT:    vins.f16 s0, s31
-; CHECK-NEXT:    vmov.f32 s29, s25
+; CHECK-NEXT:    vmov.f64 d9, d13
+; CHECK-NEXT:    vmovx.f16 s2, s15
+; CHECK-NEXT:    vmov.f32 s17, s25
 ; CHECK-NEXT:    vmov.16 q6[0], r0
 ; CHECK-NEXT:    vmov.f32 s25, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s31
 ; CHECK-NEXT:    vmov r0, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s14
 ; CHECK-NEXT:    vmov.16 q6[6], r0
-; CHECK-NEXT:    vmovx.f16 s2, s15
+; CHECK-NEXT:    vstrw.32 q4, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    vins.f16 s24, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s19
 ; CHECK-NEXT:    vins.f16 s15, s0
 ; CHECK-NEXT:    vmovx.f16 s0, s6
-; CHECK-NEXT:    vmov.f32 s4, s23
-; CHECK-NEXT:    vins.f16 s27, s2
 ; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vins.f16 s4, s7
+; CHECK-NEXT:    vins.f16 s27, s2
 ; CHECK-NEXT:    vmov.16 q0[0], r0
-; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s1, s4
 ; CHECK-NEXT:    vmovx.f16 s4, s7
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vmovx.f16 s4, s10
 ; CHECK-NEXT:    vmov.16 q0[6], r0
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s17, s12
 ; CHECK-NEXT:    vins.f16 s0, s4
 ; CHECK-NEXT:    vmovx.f16 s4, s11
 ; CHECK-NEXT:    vmovx.f16 s2, s23
@@ -1409,32 +1412,29 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vmovx.f16 s4, s5
 ; CHECK-NEXT:    vins.f16 s11, s2
 ; CHECK-NEXT:    vmov.f32 s2, s22
+; CHECK-NEXT:    vstrw.32 q4, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vmov r0, s4
 ; CHECK-NEXT:    vins.f16 s2, s6
 ; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.f32 s29, s12
-; CHECK-NEXT:    vmovx.f16 s4, s21
 ; CHECK-NEXT:    vmovx.f16 s12, s9
+; CHECK-NEXT:    vmovx.f16 s4, s21
+; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vins.f16 s9, s4
 ; CHECK-NEXT:    vmovx.f16 s4, s22
-; CHECK-NEXT:    vins.f16 s10, s4
-; CHECK-NEXT:    vmov.f32 s21, s17
-; CHECK-NEXT:    vmov.f32 s22, s18
 ; CHECK-NEXT:    vins.f16 s5, s12
+; CHECK-NEXT:    vmovx.f16 s12, s29
+; CHECK-NEXT:    vins.f16 s10, s4
 ; CHECK-NEXT:    vmov.f32 s4, s18
-; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s6, s2
-; CHECK-NEXT:    vmovx.f16 s12, s17
-; CHECK-NEXT:    vins.f16 s4, s18
 ; CHECK-NEXT:    vmov r0, s12
-; CHECK-NEXT:    vmovx.f16 s12, s13
+; CHECK-NEXT:    vins.f16 s4, s30
 ; CHECK-NEXT:    vmov.16 q7[2], r0
-; CHECK-NEXT:    vmov.f32 s2, s11
+; CHECK-NEXT:    vmov.f32 s6, s2
 ; CHECK-NEXT:    vmov.f32 s30, s4
-; CHECK-NEXT:    vmovx.f16 s4, s21
+; CHECK-NEXT:    vmovx.f16 s4, s17
+; CHECK-NEXT:    vmovx.f16 s12, s13
 ; CHECK-NEXT:    vins.f16 s13, s4
-; CHECK-NEXT:    vmovx.f16 s4, s22
+; CHECK-NEXT:    vmov.f32 s2, s11
+; CHECK-NEXT:    vmovx.f16 s4, s18
 ; CHECK-NEXT:    vins.f16 s14, s4
 ; CHECK-NEXT:    vldrw.u32 q5, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #80]
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index b36904495e878d..a3173673145ebf 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -722,45 +722,41 @@ define void @vst4_v4i64(ptr %src, ptr %dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #64
-; CHECK-NEXT:    sub sp, #64
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
-; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d15, d10
+; CHECK-NEXT:    vstrw.32 q5, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d11, d0
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #112]
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d14, d12
-; CHECK-NEXT:    vstrw.32 q7, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d14, d4
-; CHECK-NEXT:    vmov.f64 d15, d2
-; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d4, d0
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d10, d13
+; CHECK-NEXT:    vmov.f64 d10, d12
+; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d0, d13
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f64 d10, d4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vmov.f64 d11, d2
 ; CHECK-NEXT:    vmov.f64 d2, d5
-; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
+; CHECK-NEXT:    vstrw.32 q5, [r1, #16]
 ; CHECK-NEXT:    vmov.f64 d5, d6
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
-; CHECK-NEXT:    vmov.f64 d13, d8
+; CHECK-NEXT:    vmov.f64 d4, d14
+; CHECK-NEXT:    vmov.f64 d6, d15
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #64]
-; CHECK-NEXT:    vmov.f64 d12, d0
-; CHECK-NEXT:    vmov.f64 d8, d1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f64 d13, d8
+; CHECK-NEXT:    vstrw.32 q3, [r1, #96]
+; CHECK-NEXT:    vmov.f64 d8, d15
 ; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d6, d15
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vstrw.32 q3, [r1, #96]
-; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -1269,45 +1265,41 @@ define void @vst4_v4f64(ptr %src, ptr %dst) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #64
-; CHECK-NEXT:    sub sp, #64
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #80]
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
 ; CHECK-NEXT:    vldrw.u32 q6, [r0]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
-; CHECK-NEXT:    vstrw.32 q7, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d15, d10
+; CHECK-NEXT:    vstrw.32 q5, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d11, d0
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
-; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #16]
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #112]
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d14, d12
-; CHECK-NEXT:    vstrw.32 q7, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d14, d4
-; CHECK-NEXT:    vmov.f64 d15, d2
-; CHECK-NEXT:    vstrw.32 q7, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f64 d4, d0
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d10, d13
+; CHECK-NEXT:    vmov.f64 d10, d12
+; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f64 d0, d13
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f64 d10, d4
+; CHECK-NEXT:    vstrw.32 q0, [r1, #32]
+; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vmov.f64 d11, d2
 ; CHECK-NEXT:    vmov.f64 d2, d5
-; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
+; CHECK-NEXT:    vstrw.32 q5, [r1, #16]
 ; CHECK-NEXT:    vmov.f64 d5, d6
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
-; CHECK-NEXT:    vmov.f64 d13, d8
+; CHECK-NEXT:    vmov.f64 d4, d14
+; CHECK-NEXT:    vmov.f64 d6, d15
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #64]
-; CHECK-NEXT:    vmov.f64 d12, d0
-; CHECK-NEXT:    vmov.f64 d8, d1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f64 d13, d8
+; CHECK-NEXT:    vstrw.32 q3, [r1, #96]
+; CHECK-NEXT:    vmov.f64 d8, d15
 ; CHECK-NEXT:    vstrw.32 q6, [r1, #80]
-; CHECK-NEXT:    vstrw.32 q0, [r1]
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d6, d15
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #16]
-; CHECK-NEXT:    vstrw.32 q3, [r1, #96]
-; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll
index 6e22d855dc8315..9a9bf9177de7c2 100644
--- a/llvm/test/CodeGen/X86/abds-neg.ll
+++ b/llvm/test/CodeGen/X86/abds-neg.ll
@@ -643,27 +643,28 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %esi
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    cmovll %edx, %ecx
+; X86-NEXT:    cmovll %ebp, %ecx
 ; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    cmovll %esi, %ebp
 ; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmovll %esi, %edx
 ; X86-NEXT:    cmpl %esi, %eax
 ; X86-NEXT:    movl %ebx, %esi
diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll
index 6bda99c89a37e3..0c742ab416b660 100644
--- a/llvm/test/CodeGen/X86/abdu-neg.ll
+++ b/llvm/test/CodeGen/X86/abdu-neg.ll
@@ -625,27 +625,28 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %esi
+; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    cmovbl %edx, %ecx
+; X86-NEXT:    cmovbl %ebp, %ecx
 ; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    cmovbl %esi, %ebp
 ; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    cmovbl %esi, %edx
 ; X86-NEXT:    cmpl %esi, %eax
 ; X86-NEXT:    movl %ebx, %esi
diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
index cc4360317db7db..c3b0be23c0e42f 100644
--- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
+++ b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
@@ -130,7 +130,7 @@ define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 {
 define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 {
 ; CHECK-LABEL: test_amx_spill:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    subq $6088, %rsp # imm = 0x17C8
+; CHECK-NEXT:    subq $8136, %rsp # imm = 0x1FC8
 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
@@ -148,17 +148,27 @@ define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 {
 ; CHECK-NEXT:    movw $8, %ax
 ; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm0
 ; CHECK-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm4
-; CHECK-NEXT:    t2rpntlvwz0t1 (%rsi,%rdx), %tmm6
 ; CHECK-NEXT:    movabsq $64, %rcx
-; CHECK-NEXT:    tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT:    tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT:    t2rpntlvwz1 (%rsi,%rdx), %tmm6
-; CHECK-NEXT:    tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT:    tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT:    t2rpntlvwz1t1 (%rsi,%rdx), %tmm6
-; CHECK-NEXT:    tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT:    tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill
+; CHECK-NEXT:    tilestored %tmm4, -64(%rsp,%rcx) # 1024-byte Folded Spill
+; CHECK-NEXT:    tilestored %tmm5, 960(%rsp,%rcx) # 1024-byte Folded Spill
+; CHECK-NEXT:    t2rpntlvwz0t1 (%rsi,%rdx), %tmm4
+; CHECK-NEXT:    tilestored %tmm4, 6080(%rsp,%rcx) # 1024-byte Folded Spill
+; CHECK-NEXT:    tilestored %tmm5, 7104(%rsp,%rcx) # 1024-byte Folded Spill
+; CHECK-NEXT:    t2rpntlvwz1 (%rsi,%rdx), %tmm4
+; CHECK-NEXT:    tilestored %tmm4, 4032(%rsp,%rcx) # 1024-byte Folded Spill
+; CHECK-NEXT:    tilestored %tmm5, 5056(%rsp,%rcx) # 1024-byte Folded Spill
+; CHECK-NEXT:    t2rpntlvwz1t1 (%rsi,%rdx), %tmm4
+; CHECK-NEXT:    tilestored %tmm4, 1984(%rsp,%rcx) # 1024-byte Folded Spill
+; CHECK-NEXT:    tilestored %tmm5, 3008(%rsp,%rcx) # 1024-byte Folded Spill
 ; CHECK-NEXT:    t2rpntlvwz0 (%rsi,%rdx), %tmm6
+; CHECK-NEXT:    tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
+; CHECK-NEXT:    tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
+; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
+; CHECK-NEXT:    tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
+; CHECK-NEXT:    tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
+; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
+; CHECK-NEXT:    tileloadd 6080(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
+; CHECK-NEXT:    tileloadd 7104(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
 ; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
 ; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
 ; CHECK-NEXT:    tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
@@ -169,13 +179,9 @@ define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 {
 ; CHECK-NEXT:    tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
 ; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
 ; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT:    tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; CHECK-NEXT:    tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; CHECK-NEXT:    tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT:    tilestored %tmm5, (%rsi,%rdx)
 ; CHECK-NEXT:    tilestored %tmm6, (%rsi,%rdx)
 ; CHECK-NEXT:    tilestored %tmm7, (%rsi,%rdx)
-; CHECK-NEXT:    addq $6088, %rsp # imm = 0x17C8
+; CHECK-NEXT:    addq $8136, %rsp # imm = 0x1FC8
 ; CHECK-NEXT:    tilerelease
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
index a4d15a1b21d6b4..dab7785ae2cc03 100644
--- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
@@ -11,66 +11,65 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    pushq %r13
 ; EGPR-NEXT:    pushq %r12
 ; EGPR-NEXT:    pushq %rbx
-; EGPR-NEXT:    subq $104, %rsp
+; EGPR-NEXT:    subq $88, %rsp
 ; EGPR-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %rdi, %r24
 ; EGPR-NEXT:    movq (%rdi), %r13
 ; EGPR-NEXT:    movq 8(%rdi), %r18
-; EGPR-NEXT:    movq 24(%rdi), %r29
+; EGPR-NEXT:    movq 24(%rdi), %r11
 ; EGPR-NEXT:    movq 16(%rdi), %r17
-; EGPR-NEXT:    movq 40(%rdi), %rdi
-; EGPR-NEXT:    movq 32(%r24), %r10
-; EGPR-NEXT:    movq 56(%r24), %r15
-; EGPR-NEXT:    movq 48(%r24), %r12
+; EGPR-NEXT:    movq 40(%rdi), %r10
+; EGPR-NEXT:    movq 32(%rdi), %r26
+; EGPR-NEXT:    movq 56(%rdi), %r14
+; EGPR-NEXT:    movq 48(%rdi), %r15
 ; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    movq 24(%rsi), %r23
-; EGPR-NEXT:    movq 16(%rsi), %r11
+; EGPR-NEXT:    movq 16(%rsi), %r29
 ; EGPR-NEXT:    movq (%rsi), %r27
-; EGPR-NEXT:    movq 8(%rsi), %r14
-; EGPR-NEXT:    movq %r12, %rax
+; EGPR-NEXT:    movq 8(%rsi), %r24
+; EGPR-NEXT:    movq %r15, %rax
 ; EGPR-NEXT:    mulq %r27
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %r19
-; EGPR-NEXT:    movq %r15, %rax
+; EGPR-NEXT:    movq %r14, %rax
 ; EGPR-NEXT:    mulq %r27
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    movq %rax, %r16
 ; EGPR-NEXT:    addq %r8, %r16
 ; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %r12, %rax
-; EGPR-NEXT:    mulq %r14
+; EGPR-NEXT:    movq %r15, %rax
+; EGPR-NEXT:    mulq %r24
 ; EGPR-NEXT:    movq %rdx, %r20
 ; EGPR-NEXT:    movq %rax, %r8
 ; EGPR-NEXT:    addq %r16, %r8
 ; EGPR-NEXT:    adcq %r9, %r20
 ; EGPR-NEXT:    setb %al
 ; EGPR-NEXT:    movzbl %al, %ecx
-; EGPR-NEXT:    movq %r15, %rax
-; EGPR-NEXT:    mulq %r14
+; EGPR-NEXT:    movq %r14, %rax
+; EGPR-NEXT:    mulq %r24
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    movq %rax, %r16
 ; EGPR-NEXT:    addq %r20, %r16
 ; EGPR-NEXT:    adcq %rcx, %r9
-; EGPR-NEXT:    movq %r10, %rax
+; EGPR-NEXT:    movq %r26, %rax
 ; EGPR-NEXT:    mulq %r27
 ; EGPR-NEXT:    movq %rdx, %r20
 ; EGPR-NEXT:    movq %rax, %r25
-; EGPR-NEXT:    movq %rdi, %rax
+; EGPR-NEXT:    movq %r10, %rax
 ; EGPR-NEXT:    mulq %r27
 ; EGPR-NEXT:    movq %rdx, %r21
 ; EGPR-NEXT:    movq %rax, %r22
 ; EGPR-NEXT:    addq %r20, %r22
 ; EGPR-NEXT:    adcq $0, %r21
-; EGPR-NEXT:    movq %r10, %rax
-; EGPR-NEXT:    mulq %r14
+; EGPR-NEXT:    movq %r26, %rax
+; EGPR-NEXT:    mulq %r24
 ; EGPR-NEXT:    movq %rdx, %r20
 ; EGPR-NEXT:    movq %rax, %r28
 ; EGPR-NEXT:    addq %r22, %r28
 ; EGPR-NEXT:    adcq %r21, %r20
 ; EGPR-NEXT:    setb %al
 ; EGPR-NEXT:    movzbl %al, %ecx
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    mulq %r14
+; EGPR-NEXT:    movq %r10, %rax
+; EGPR-NEXT:    mulq %r24
 ; EGPR-NEXT:    movq %rdx, %r21
 ; EGPR-NEXT:    movq %rax, %r22
 ; EGPR-NEXT:    addq %r20, %r22
@@ -79,19 +78,19 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq %r8, %r21
 ; EGPR-NEXT:    adcq $0, %r16
 ; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %r10, %rax
-; EGPR-NEXT:    mulq %r11
+; EGPR-NEXT:    movq %r26, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq %r26, %rax
+; EGPR-NEXT:    mulq %r29
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %r30
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    mulq %r11
+; EGPR-NEXT:    movq %r10, %rax
+; EGPR-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    mulq %r29
 ; EGPR-NEXT:    movq %rdx, %r19
 ; EGPR-NEXT:    movq %rax, %r20
 ; EGPR-NEXT:    addq %r8, %r20
 ; EGPR-NEXT:    adcq $0, %r19
-; EGPR-NEXT:    movq %r10, %rax
+; EGPR-NEXT:    movq %r26, %rax
 ; EGPR-NEXT:    mulq %r23
 ; EGPR-NEXT:    movq %rdx, %rbx
 ; EGPR-NEXT:    movq %rax, %r31
@@ -99,7 +98,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq %r19, %rbx
 ; EGPR-NEXT:    setb %al
 ; EGPR-NEXT:    movzbl %al, %ecx
-; EGPR-NEXT:    movq %rdi, %rax
+; EGPR-NEXT:    movq %r10, %rax
 ; EGPR-NEXT:    mulq %r23
 ; EGPR-NEXT:    movq %rdx, %r26
 ; EGPR-NEXT:    movq %rax, %r8
@@ -113,27 +112,27 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq %r9, %r26
 ; EGPR-NEXT:    setb %al
 ; EGPR-NEXT:    movzbl %al, %ecx
-; EGPR-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %r12, %rax
-; EGPR-NEXT:    mulq %r11
-; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %rsi
 ; EGPR-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    movq %r15, %rax
-; EGPR-NEXT:    mulq %r11
+; EGPR-NEXT:    mulq %r29
+; EGPR-NEXT:    movq %rdx, %r9
+; EGPR-NEXT:    movq %rax, %rsi
+; EGPR-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq %r14, %rax
+; EGPR-NEXT:    mulq %r29
 ; EGPR-NEXT:    movq %rdx, %r16
 ; EGPR-NEXT:    movq %rax, %r21
 ; EGPR-NEXT:    addq %r9, %r21
 ; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    movq %r12, %rax
+; EGPR-NEXT:    movq %r15, %rax
 ; EGPR-NEXT:    mulq %r23
 ; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %rdi
-; EGPR-NEXT:    addq %r21, %rdi
+; EGPR-NEXT:    movq %rax, %r20
+; EGPR-NEXT:    addq %r21, %r20
 ; EGPR-NEXT:    adcq %r16, %r9
 ; EGPR-NEXT:    setb %al
 ; EGPR-NEXT:    movzbl %al, %r10d
-; EGPR-NEXT:    movq %r15, %rax
+; EGPR-NEXT:    movq %r14, %rax
 ; EGPR-NEXT:    mulq %r23
 ; EGPR-NEXT:    movq %rdx, %r21
 ; EGPR-NEXT:    movq %rax, %r22
@@ -141,30 +140,29 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq %r10, %r21
 ; EGPR-NEXT:    addq %r8, %rsi
 ; EGPR-NEXT:    movq %rsi, %r19
-; EGPR-NEXT:    adcq %r26, %rdi
+; EGPR-NEXT:    adcq %r26, %r20
 ; EGPR-NEXT:    adcq %rcx, %r22
 ; EGPR-NEXT:    adcq $0, %r21
 ; EGPR-NEXT:    movq %r17, %rax
 ; EGPR-NEXT:    mulq %r27
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %rbx
-; EGPR-NEXT:    movq %r29, %rax
+; EGPR-NEXT:    movq %r11, %rax
 ; EGPR-NEXT:    mulq %r27
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    movq %rax, %r16
 ; EGPR-NEXT:    addq %r8, %r16
 ; EGPR-NEXT:    adcq $0, %r9
 ; EGPR-NEXT:    movq %r17, %rax
-; EGPR-NEXT:    mulq %r14
+; EGPR-NEXT:    mulq %r24
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %r26
 ; EGPR-NEXT:    addq %r16, %r26
 ; EGPR-NEXT:    adcq %r9, %r8
 ; EGPR-NEXT:    setb %al
 ; EGPR-NEXT:    movzbl %al, %ecx
-; EGPR-NEXT:    movq %r29, %rax
-; EGPR-NEXT:    mulq %r14
-; EGPR-NEXT:    movq %r14, %rsi
+; EGPR-NEXT:    movq %r11, %rax
+; EGPR-NEXT:    mulq %r24
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    movq %rax, %r16
 ; EGPR-NEXT:    addq %r8, %r16
@@ -180,15 +178,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    addq %r8, %r15
 ; EGPR-NEXT:    adcq $0, %r14
 ; EGPR-NEXT:    movq %r13, %rax
-; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    mulq %rsi
+; EGPR-NEXT:    mulq %r24
 ; EGPR-NEXT:    movq %rdx, %r12
 ; EGPR-NEXT:    addq %r15, %rax
 ; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    adcq %r14, %r12
 ; EGPR-NEXT:    setb %cl
 ; EGPR-NEXT:    movq %r18, %rax
-; EGPR-NEXT:    mulq %rsi
+; EGPR-NEXT:    mulq %r24
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %r15
 ; EGPR-NEXT:    addq %r12, %r15
@@ -199,11 +196,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq $0, %r16
 ; EGPR-NEXT:    adcq $0, %r9
 ; EGPR-NEXT:    movq %r13, %rax
-; EGPR-NEXT:    mulq %r11
+; EGPR-NEXT:    mulq %r29
 ; EGPR-NEXT:    movq %rdx, %r26
 ; EGPR-NEXT:    movq %rax, %rsi
 ; EGPR-NEXT:    movq %r18, %rax
-; EGPR-NEXT:    mulq %r11
+; EGPR-NEXT:    mulq %r29
 ; EGPR-NEXT:    movq %rdx, %rbx
 ; EGPR-NEXT:    movq %rax, %r14
 ; EGPR-NEXT:    addq %r26, %r14
@@ -232,12 +229,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq %r9, %r14
 ; EGPR-NEXT:    setb %cl
 ; EGPR-NEXT:    movq %r17, %rax
-; EGPR-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    mulq %r11
+; EGPR-NEXT:    mulq %r29
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %rbx
-; EGPR-NEXT:    movq %r29, %rax
-; EGPR-NEXT:    mulq %r11
+; EGPR-NEXT:    movq %r11, %rax
+; EGPR-NEXT:    mulq %r29
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    movq %rax, %r16
 ; EGPR-NEXT:    addq %r8, %r16
@@ -249,7 +245,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    addq %r16, %r15
 ; EGPR-NEXT:    adcq %r9, %r8
 ; EGPR-NEXT:    setb %r9b
-; EGPR-NEXT:    movq %r29, %rax
+; EGPR-NEXT:    movq %r11, %rax
 ; EGPR-NEXT:    mulq %r23
 ; EGPR-NEXT:    movq %rdx, %r12
 ; EGPR-NEXT:    movq %rax, %rbp
@@ -269,14 +265,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq %r31, %r12
 ; EGPR-NEXT:    adcq $0, %r19
 ; EGPR-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq $0, %rdi
+; EGPR-NEXT:    adcq $0, %r20
 ; EGPR-NEXT:    adcq $0, %r22
 ; EGPR-NEXT:    adcq $0, %r21
 ; EGPR-NEXT:    movq %r17, %rax
 ; EGPR-NEXT:    mulq %r25
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %r28
-; EGPR-NEXT:    movq %r29, %rax
+; EGPR-NEXT:    movq %r11, %rax
 ; EGPR-NEXT:    mulq %r25
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    movq %rax, %r16
@@ -290,7 +286,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    addq %r16, %r26
 ; EGPR-NEXT:    adcq %r9, %r8
 ; EGPR-NEXT:    setb %r10b
-; EGPR-NEXT:    movq %r29, %rax
+; EGPR-NEXT:    movq %r11, %rax
 ; EGPR-NEXT:    mulq %rcx
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    movq %rax, %r16
@@ -300,7 +296,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movq %r13, %rax
 ; EGPR-NEXT:    mulq %r25
 ; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r19
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    movq %r18, %rax
 ; EGPR-NEXT:    mulq %r25
 ; EGPR-NEXT:    movq %rdx, %r30
@@ -308,10 +304,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    addq %r8, %r31
 ; EGPR-NEXT:    adcq $0, %r30
 ; EGPR-NEXT:    movq %r13, %rax
+; EGPR-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    mulq %rcx
 ; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r20
-; EGPR-NEXT:    addq %r31, %r20
+; EGPR-NEXT:    addq %r31, %rax
+; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    adcq %r30, %r8
 ; EGPR-NEXT:    setb %r10b
 ; EGPR-NEXT:    movq %r18, %rax
@@ -326,11 +323,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq $0, %r16
 ; EGPR-NEXT:    adcq $0, %r9
 ; EGPR-NEXT:    movq 48(%rsi), %r28
-; EGPR-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    movq %r13, %rax
 ; EGPR-NEXT:    mulq %r28
 ; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %r11
+; EGPR-NEXT:    movq %rax, %r19
 ; EGPR-NEXT:    movq %r18, %rax
 ; EGPR-NEXT:    movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    mulq %r28
@@ -343,18 +339,18 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    mulq %r10
 ; EGPR-NEXT:    movq %rdx, %r13
 ; EGPR-NEXT:    addq %r14, %rax
-; EGPR-NEXT:    movq %rax, %r14
+; EGPR-NEXT:    movq %rax, %rsi
 ; EGPR-NEXT:    adcq %r26, %r13
-; EGPR-NEXT:    setb %sil
+; EGPR-NEXT:    setb %r14b
 ; EGPR-NEXT:    movq %r18, %rax
 ; EGPR-NEXT:    mulq %r10
 ; EGPR-NEXT:    movq %rdx, %r26
 ; EGPR-NEXT:    movq %rax, %r8
 ; EGPR-NEXT:    addq %r13, %r8
-; EGPR-NEXT:    movzbl %sil, %eax
+; EGPR-NEXT:    movzbl %r14b, %eax
 ; EGPR-NEXT:    adcq %rax, %r26
-; EGPR-NEXT:    addq %r31, %r11
-; EGPR-NEXT:    adcq %r30, %r14
+; EGPR-NEXT:    addq %r31, %r19
+; EGPR-NEXT:    adcq %r30, %rsi
 ; EGPR-NEXT:    adcq $0, %r8
 ; EGPR-NEXT:    adcq $0, %r26
 ; EGPR-NEXT:    addq %r16, %r8
@@ -365,8 +361,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    mulq %r28
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    movq %rax, %r30
-; EGPR-NEXT:    movq %r29, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq %r29, %rax
+; EGPR-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    movq %r11, %rax
 ; EGPR-NEXT:    mulq %r28
 ; EGPR-NEXT:    movq %rdx, %r16
 ; EGPR-NEXT:    movq %rax, %r31
@@ -375,11 +371,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movq %r17, %rax
 ; EGPR-NEXT:    mulq %r10
 ; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %r17
-; EGPR-NEXT:    addq %r31, %r17
+; EGPR-NEXT:    movq %rax, %r14
+; EGPR-NEXT:    addq %r31, %r14
 ; EGPR-NEXT:    adcq %r16, %r9
 ; EGPR-NEXT:    setb %r16b
-; EGPR-NEXT:    movq %r29, %rax
+; EGPR-NEXT:    movq %r11, %rax
 ; EGPR-NEXT:    mulq %r10
 ; EGPR-NEXT:    movq %rdx, %r13
 ; EGPR-NEXT:    movq %rax, %r31
@@ -387,24 +383,22 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movzbl %r16b, %eax
 ; EGPR-NEXT:    adcq %rax, %r13
 ; EGPR-NEXT:    addq %r8, %r30
-; EGPR-NEXT:    adcq %r26, %r17
+; EGPR-NEXT:    adcq %r26, %r14
 ; EGPR-NEXT:    movzbl %r18b, %eax
 ; EGPR-NEXT:    adcq %rax, %r31
 ; EGPR-NEXT:    adcq $0, %r13
-; EGPR-NEXT:    addq %rbx, %r19
+; EGPR-NEXT:    addq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; EGPR-NEXT:    adcq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; EGPR-NEXT:    adcq %rbp, %r19
 ; EGPR-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r15, %r20
-; EGPR-NEXT:    movq %r20, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %rbp, %r11
-; EGPR-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r12, %r14
-; EGPR-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    adcq %r12, %rsi
+; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    adcq $0, %r30
-; EGPR-NEXT:    adcq $0, %r17
+; EGPR-NEXT:    adcq $0, %r14
 ; EGPR-NEXT:    adcq $0, %r31
 ; EGPR-NEXT:    adcq $0, %r13
 ; EGPR-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r30 # 8-byte Folded Reload
-; EGPR-NEXT:    adcq %rdi, %r17
+; EGPR-NEXT:    adcq %r20, %r14
 ; EGPR-NEXT:    adcq %r22, %r31
 ; EGPR-NEXT:    adcq %r21, %r13
 ; EGPR-NEXT:    setb %r15b
@@ -421,7 +415,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    addq %r8, %r16
 ; EGPR-NEXT:    adcq $0, %r9
 ; EGPR-NEXT:    movq %rsi, %rax
-; EGPR-NEXT:    movq %rsi, %r29
+; EGPR-NEXT:    movq %rsi, %r12
 ; EGPR-NEXT:    mulq %rcx
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %r20
@@ -429,7 +423,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq %r9, %r8
 ; EGPR-NEXT:    setb %r18b
 ; EGPR-NEXT:    movq %r21, %rax
-; EGPR-NEXT:    movq %r21, %r14
+; EGPR-NEXT:    movq %r21, %rbp
 ; EGPR-NEXT:    mulq %rcx
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    movq %rax, %r16
@@ -440,7 +434,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movq %rbx, %rax
 ; EGPR-NEXT:    mulq %r25
 ; EGPR-NEXT:    movq %rdx, %r8
-; EGPR-NEXT:    movq %rax, %rdi
+; EGPR-NEXT:    movq %rax, %r11
 ; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; EGPR-NEXT:    movq %rsi, %rax
 ; EGPR-NEXT:    mulq %r25
@@ -452,7 +446,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    mulq %rcx
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    addq %r22, %rax
-; EGPR-NEXT:    movq %rax, %r11
+; EGPR-NEXT:    movq %rax, %r17
 ; EGPR-NEXT:    adcq %r21, %r8
 ; EGPR-NEXT:    setb %r18b
 ; EGPR-NEXT:    movq %rsi, %rax
@@ -498,24 +492,24 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    addq %r16, %r8
 ; EGPR-NEXT:    adcq %r9, %r21
 ; EGPR-NEXT:    setb %r18b
-; EGPR-NEXT:    movq %r29, %rax
+; EGPR-NEXT:    movq %r12, %rax
 ; EGPR-NEXT:    mulq %r28
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    movq %rax, %r22
-; EGPR-NEXT:    movq %r14, %rax
+; EGPR-NEXT:    movq %rbp, %rax
 ; EGPR-NEXT:    mulq %r28
 ; EGPR-NEXT:    movq %rdx, %r16
 ; EGPR-NEXT:    movq %rax, %r19
 ; EGPR-NEXT:    addq %r9, %r19
 ; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    movq %r29, %rax
+; EGPR-NEXT:    movq %r12, %rax
 ; EGPR-NEXT:    mulq %r10
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    addq %r19, %rax
 ; EGPR-NEXT:    movq %rax, %r19
 ; EGPR-NEXT:    adcq %r16, %r9
 ; EGPR-NEXT:    setb %r16b
-; EGPR-NEXT:    movq %r14, %rax
+; EGPR-NEXT:    movq %rbp, %rax
 ; EGPR-NEXT:    mulq %r10
 ; EGPR-NEXT:    movq %rdx, %rbp
 ; EGPR-NEXT:    movq %rax, %r12
@@ -527,10 +521,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movzbl %r18b, %eax
 ; EGPR-NEXT:    adcq %rax, %r12
 ; EGPR-NEXT:    adcq $0, %rbp
-; EGPR-NEXT:    addq %r30, %rdi
-; EGPR-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    adcq %r17, %r11
+; EGPR-NEXT:    addq %r30, %r11
 ; EGPR-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT:    adcq %r14, %r17
+; EGPR-NEXT:    movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    adcq %r31, %rsi
 ; EGPR-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    adcq %r13, %r20
@@ -542,9 +536,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    adcq $0, %r12
 ; EGPR-NEXT:    adcq $0, %rbp
-; EGPR-NEXT:    movq 64(%r24), %r21
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; EGPR-NEXT:    movq %rdi, %rax
+; EGPR-NEXT:    movq 64(%rdi), %r21
+; EGPR-NEXT:    movq %r29, %rax
 ; EGPR-NEXT:    mulq %r21
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %r22
@@ -554,8 +547,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movq %rax, %r16
 ; EGPR-NEXT:    addq %r8, %r16
 ; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq 72(%r24), %r30
-; EGPR-NEXT:    movq %rdi, %rax
+; EGPR-NEXT:    movq 72(%rdi), %r30
+; EGPR-NEXT:    movq %r29, %rax
 ; EGPR-NEXT:    mulq %r30
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %r26
@@ -573,8 +566,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    mulq %r21
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; EGPR-NEXT:    movq %r11, %rax
+; EGPR-NEXT:    movq %r24, %rax
 ; EGPR-NEXT:    mulq %r21
 ; EGPR-NEXT:    movq %rdx, %r31
 ; EGPR-NEXT:    movq %rax, %rbx
@@ -587,7 +579,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT:    adcq %r31, %r8
 ; EGPR-NEXT:    setb %r18b
-; EGPR-NEXT:    movq %r11, %rax
+; EGPR-NEXT:    movq %r24, %rax
 ; EGPR-NEXT:    mulq %r30
 ; EGPR-NEXT:    movq %rdx, %r31
 ; EGPR-NEXT:    movq %rax, %rbx
@@ -598,18 +590,18 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq %r26, %r31
 ; EGPR-NEXT:    adcq $0, %r16
 ; EGPR-NEXT:    adcq $0, %r9
-; EGPR-NEXT:    movq 80(%r24), %r13
+; EGPR-NEXT:    movq 80(%rdi), %r13
 ; EGPR-NEXT:    movq %r27, %rax
 ; EGPR-NEXT:    mulq %r13
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %rsi
-; EGPR-NEXT:    movq %r11, %rax
+; EGPR-NEXT:    movq %r24, %rax
 ; EGPR-NEXT:    mulq %r13
 ; EGPR-NEXT:    movq %rdx, %r26
 ; EGPR-NEXT:    movq %rax, %r14
 ; EGPR-NEXT:    addq %r8, %r14
 ; EGPR-NEXT:    adcq $0, %r26
-; EGPR-NEXT:    movq 88(%r24), %r18
+; EGPR-NEXT:    movq 88(%rdi), %r18
 ; EGPR-NEXT:    movq %r27, %rax
 ; EGPR-NEXT:    mulq %r18
 ; EGPR-NEXT:    movq %rdx, %r15
@@ -617,7 +609,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    addq %r14, %r22
 ; EGPR-NEXT:    adcq %r26, %r15
 ; EGPR-NEXT:    setb %r14b
-; EGPR-NEXT:    movq %r11, %rax
+; EGPR-NEXT:    movq %r24, %rax
 ; EGPR-NEXT:    mulq %r18
 ; EGPR-NEXT:    movq %rdx, %r26
 ; EGPR-NEXT:    movq %rax, %r8
@@ -631,18 +623,18 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq $0, %r26
 ; EGPR-NEXT:    addq %r16, %r8
 ; EGPR-NEXT:    adcq %r9, %r26
-; EGPR-NEXT:    setb %r31b
-; EGPR-NEXT:    movq %rdi, %rax
+; EGPR-NEXT:    setb %sil
+; EGPR-NEXT:    movq %r29, %rax
 ; EGPR-NEXT:    mulq %r13
 ; EGPR-NEXT:    movq %rdx, %r9
-; EGPR-NEXT:    movq %rax, %rsi
+; EGPR-NEXT:    movq %rax, %r31
 ; EGPR-NEXT:    movq %r23, %rax
 ; EGPR-NEXT:    mulq %r13
 ; EGPR-NEXT:    movq %rdx, %r16
 ; EGPR-NEXT:    movq %rax, %r14
 ; EGPR-NEXT:    addq %r9, %r14
 ; EGPR-NEXT:    adcq $0, %r16
-; EGPR-NEXT:    movq %rdi, %rax
+; EGPR-NEXT:    movq %r29, %rax
 ; EGPR-NEXT:    mulq %r18
 ; EGPR-NEXT:    movq %rdx, %r9
 ; EGPR-NEXT:    movq %rax, %rbx
@@ -656,9 +648,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    addq %r9, %r15
 ; EGPR-NEXT:    movzbl %r16b, %eax
 ; EGPR-NEXT:    adcq %rax, %r14
-; EGPR-NEXT:    addq %r8, %rsi
+; EGPR-NEXT:    addq %r8, %r31
 ; EGPR-NEXT:    adcq %r26, %rbx
-; EGPR-NEXT:    movzbl %r31b, %eax
+; EGPR-NEXT:    movzbl %sil, %eax
 ; EGPR-NEXT:    adcq %rax, %r15
 ; EGPR-NEXT:    adcq $0, %r14
 ; EGPR-NEXT:    imulq %r25, %r18
@@ -668,12 +660,12 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    addq %r18, %rdx
 ; EGPR-NEXT:    imulq %rcx, %r13
 ; EGPR-NEXT:    addq %rdx, %r13
-; EGPR-NEXT:    movq %r28, %r9
-; EGPR-NEXT:    imulq %r30, %r9
+; EGPR-NEXT:    movq %r28, %rsi
+; EGPR-NEXT:    imulq %r30, %rsi
 ; EGPR-NEXT:    movq %r28, %rax
 ; EGPR-NEXT:    mulq %r21
 ; EGPR-NEXT:    movq %rax, %r26
-; EGPR-NEXT:    addq %r9, %rdx
+; EGPR-NEXT:    addq %rsi, %rdx
 ; EGPR-NEXT:    imulq %r21, %r10
 ; EGPR-NEXT:    addq %rdx, %r10
 ; EGPR-NEXT:    addq %r8, %r26
@@ -694,32 +686,33 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movq %rax, %r16
 ; EGPR-NEXT:    addq %r28, %r16
 ; EGPR-NEXT:    adcq %r25, %r8
-; EGPR-NEXT:    setb %r18b
+; EGPR-NEXT:    setb %sil
 ; EGPR-NEXT:    movq %r30, %rax
 ; EGPR-NEXT:    mulq %rcx
 ; EGPR-NEXT:    movq %rdx, %r21
 ; EGPR-NEXT:    movq %rax, %r28
 ; EGPR-NEXT:    addq %r8, %r28
-; EGPR-NEXT:    movzbl %r18b, %eax
+; EGPR-NEXT:    movzbl %sil, %eax
 ; EGPR-NEXT:    adcq %rax, %r21
 ; EGPR-NEXT:    addq %r26, %r28
 ; EGPR-NEXT:    adcq %r10, %r21
-; EGPR-NEXT:    movq 112(%r24), %rcx
+; EGPR-NEXT:    movq 112(%rdi), %rcx
 ; EGPR-NEXT:    movq %r27, %rax
 ; EGPR-NEXT:    mulq %rcx
 ; EGPR-NEXT:    movq %rax, %r8
-; EGPR-NEXT:    imulq %r11, %rcx
+; EGPR-NEXT:    imulq %r24, %rcx
 ; EGPR-NEXT:    addq %rdx, %rcx
-; EGPR-NEXT:    movq 120(%r24), %rax
+; EGPR-NEXT:    movq 120(%rdi), %rax
 ; EGPR-NEXT:    imulq %r27, %rax
 ; EGPR-NEXT:    addq %rax, %rcx
-; EGPR-NEXT:    movq 96(%r24), %r25
-; EGPR-NEXT:    movq 104(%r24), %r26
-; EGPR-NEXT:    movq %rdi, %rax
-; EGPR-NEXT:    imulq %r26, %rdi
+; EGPR-NEXT:    movq 96(%rdi), %r25
+; EGPR-NEXT:    movq 104(%rdi), %r26
+; EGPR-NEXT:    movq %r29, %rsi
+; EGPR-NEXT:    imulq %r26, %rsi
+; EGPR-NEXT:    movq %r29, %rax
 ; EGPR-NEXT:    mulq %r25
 ; EGPR-NEXT:    movq %rax, %r29
-; EGPR-NEXT:    addq %rdi, %rdx
+; EGPR-NEXT:    addq %rsi, %rdx
 ; EGPR-NEXT:    imulq %r25, %r23
 ; EGPR-NEXT:    addq %rdx, %r23
 ; EGPR-NEXT:    addq %r8, %r29
@@ -735,14 +728,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    addq %r8, %r30
 ; EGPR-NEXT:    adcq $0, %r27
 ; EGPR-NEXT:    movq %r25, %rax
-; EGPR-NEXT:    mulq %r11
+; EGPR-NEXT:    mulq %r24
 ; EGPR-NEXT:    movq %rdx, %r8
 ; EGPR-NEXT:    movq %rax, %r25
 ; EGPR-NEXT:    addq %r30, %r25
 ; EGPR-NEXT:    adcq %r27, %r8
 ; EGPR-NEXT:    setb %cl
 ; EGPR-NEXT:    movq %r26, %rax
-; EGPR-NEXT:    mulq %r11
+; EGPR-NEXT:    mulq %r24
 ; EGPR-NEXT:    movq %rdx, %r24
 ; EGPR-NEXT:    movq %rax, %r27
 ; EGPR-NEXT:    addq %r8, %r27
@@ -754,7 +747,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    adcq %r16, %r25
 ; EGPR-NEXT:    adcq %r28, %r27
 ; EGPR-NEXT:    adcq %r21, %r24
-; EGPR-NEXT:    addq %rsi, %r20
+; EGPR-NEXT:    addq %r31, %r20
 ; EGPR-NEXT:    adcq %rbx, %r25
 ; EGPR-NEXT:    adcq %r15, %r27
 ; EGPR-NEXT:    adcq %r14, %r24
@@ -1024,7 +1017,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT:    movq %r18, 104(%rcx)
 ; EGPR-NEXT:    movq %rax, 112(%rcx)
 ; EGPR-NEXT:    movq %rdx, 120(%rcx)
-; EGPR-NEXT:    addq $104, %rsp
+; EGPR-NEXT:    addq $88, %rsp
 ; EGPR-NEXT:    popq %rbx
 ; EGPR-NEXT:    popq %r12
 ; EGPR-NEXT:    popq %r13
@@ -1269,103 +1262,102 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NDD-NEXT:    setb %r8b
 ; EGPR-NDD-NEXT:    movq %r9, %rax
 ; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %rcx, %rax, %rdi
+; EGPR-NDD-NEXT:    addq %rcx, %rax, %rsi
 ; EGPR-NDD-NEXT:    movzbl %r8b, %eax
-; EGPR-NDD-NEXT:    adcq %rax, %rdx, %rsi
+; EGPR-NDD-NEXT:    adcq %rax, %rdx, %rcx
+; EGPR-NDD-NEXT:    movq %r17, %r10
 ; EGPR-NDD-NEXT:    movq %r17, %rax
 ; EGPR-NDD-NEXT:    mulq %r26
 ; EGPR-NDD-NEXT:    movq %rdx, %r28
 ; EGPR-NDD-NEXT:    movq %rax, %r25
-; EGPR-NDD-NEXT:    movq %r11, %r10
 ; EGPR-NDD-NEXT:    movq %r11, %rax
 ; EGPR-NDD-NEXT:    mulq %r26
 ; EGPR-NDD-NEXT:    addq %r28, %rax, %r8
 ; EGPR-NDD-NEXT:    adcq $0, %rdx, %r28
 ; EGPR-NDD-NEXT:    movq %r17, %rax
+; EGPR-NDD-NEXT:    movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT:    mulq %r18
 ; EGPR-NDD-NEXT:    addq %r8, %rax, %r23
 ; EGPR-NDD-NEXT:    adcq %rdx, %r28
-; EGPR-NDD-NEXT:    setb %cl
+; EGPR-NDD-NEXT:    setb %dil
 ; EGPR-NDD-NEXT:    movq %r11, %rax
 ; EGPR-NDD-NEXT:    mulq %r18
 ; EGPR-NDD-NEXT:    addq %r28, %rax
-; EGPR-NDD-NEXT:    movzbl %cl, %ecx
-; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    addq %rax, %r27
-; EGPR-NDD-NEXT:    adcq %rcx, %r29, %r8
-; EGPR-NDD-NEXT:    adcq $0, %rdi
-; EGPR-NDD-NEXT:    adcq $0, %rsi, %r9
-; EGPR-NDD-NEXT:    movq 48(%r15), %r11
-; EGPR-NDD-NEXT:    movq %r17, %rsi
-; EGPR-NDD-NEXT:    movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movzbl %dil, %edi
+; EGPR-NDD-NEXT:    adcq %rdi, %rdx
+; EGPR-NDD-NEXT:    addq %rax, %r27, %rdi
+; EGPR-NDD-NEXT:    adcq %rdx, %r29, %r8
+; EGPR-NDD-NEXT:    adcq $0, %rsi
+; EGPR-NDD-NEXT:    adcq $0, %rcx, %r9
+; EGPR-NDD-NEXT:    movq 48(%r15), %r27
 ; EGPR-NDD-NEXT:    movq %r17, %rax
-; EGPR-NDD-NEXT:    mulq %r11
+; EGPR-NDD-NEXT:    mulq %r27
 ; EGPR-NDD-NEXT:    movq %rdx, %r28
 ; EGPR-NDD-NEXT:    movq %rax, %r29
-; EGPR-NDD-NEXT:    movq %r10, %rax
-; EGPR-NDD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    mulq %r11
+; EGPR-NDD-NEXT:    movq %r11, %rax
+; EGPR-NDD-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    mulq %r27
 ; EGPR-NDD-NEXT:    addq %rax, %r28
 ; EGPR-NDD-NEXT:    adcq $0, %rdx, %rcx
 ; EGPR-NDD-NEXT:    movq 56(%r15), %r17
-; EGPR-NDD-NEXT:    movq %rsi, %rax
+; EGPR-NDD-NEXT:    movq %r10, %rax
 ; EGPR-NDD-NEXT:    mulq %r17
 ; EGPR-NDD-NEXT:    addq %rax, %r28
 ; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    setb %sil
-; EGPR-NDD-NEXT:    movq %r10, %rax
+; EGPR-NDD-NEXT:    setb %r10b
+; EGPR-NDD-NEXT:    movq %r11, %rax
 ; EGPR-NDD-NEXT:    mulq %r17
 ; EGPR-NDD-NEXT:    addq %rcx, %rax
-; EGPR-NDD-NEXT:    movzbl %sil, %ecx
+; EGPR-NDD-NEXT:    movzbl %r10b, %ecx
 ; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    addq %r29, %r27
-; EGPR-NDD-NEXT:    adcq %r8, %r28, %r10
+; EGPR-NDD-NEXT:    addq %rdi, %r29, %r11
+; EGPR-NDD-NEXT:    adcq %r28, %r8
 ; EGPR-NDD-NEXT:    adcq $0, %rax
 ; EGPR-NDD-NEXT:    adcq $0, %rcx
-; EGPR-NDD-NEXT:    addq %rax, %rdi
-; EGPR-NDD-NEXT:    adcq %rcx, %r9, %r8
-; EGPR-NDD-NEXT:    setb %sil
+; EGPR-NDD-NEXT:    addq %rax, %rsi
+; EGPR-NDD-NEXT:    adcq %r9, %rcx
+; EGPR-NDD-NEXT:    setb %r9b
 ; EGPR-NDD-NEXT:    movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    mulq %r11
+; EGPR-NDD-NEXT:    mulq %r27
 ; EGPR-NDD-NEXT:    movq %rdx, %r28
 ; EGPR-NDD-NEXT:    movq %rax, %r29
 ; EGPR-NDD-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT:    movq %r19, %rax
-; EGPR-NDD-NEXT:    mulq %r11
+; EGPR-NDD-NEXT:    mulq %r27
 ; EGPR-NDD-NEXT:    addq %rax, %r28
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r9
+; EGPR-NDD-NEXT:    adcq $0, %rdx, %r10
 ; EGPR-NDD-NEXT:    movq %r16, %rax
 ; EGPR-NDD-NEXT:    mulq %r17
 ; EGPR-NDD-NEXT:    addq %rax, %r28
-; EGPR-NDD-NEXT:    adcq %rdx, %r9
-; EGPR-NDD-NEXT:    setb %cl
+; EGPR-NDD-NEXT:    adcq %rdx, %r10
+; EGPR-NDD-NEXT:    setb %dil
 ; EGPR-NDD-NEXT:    movq %r19, %rax
 ; EGPR-NDD-NEXT:    mulq %r17
-; EGPR-NDD-NEXT:    addq %r9, %rax
-; EGPR-NDD-NEXT:    movzbl %cl, %ecx
-; EGPR-NDD-NEXT:    adcq %rdx, %rcx
-; EGPR-NDD-NEXT:    addq %r29, %rdi
-; EGPR-NDD-NEXT:    adcq %r28, %r8
-; EGPR-NDD-NEXT:    movzbl %sil, %edx
-; EGPR-NDD-NEXT:    adcq %rdx, %rax
-; EGPR-NDD-NEXT:    adcq $0, %rcx
+; EGPR-NDD-NEXT:    addq %r10, %rax
+; EGPR-NDD-NEXT:    movzbl %dil, %edi
+; EGPR-NDD-NEXT:    adcq %rdi, %rdx
+; EGPR-NDD-NEXT:    addq %r29, %rsi
+; EGPR-NDD-NEXT:    adcq %r28, %rcx
+; EGPR-NDD-NEXT:    movzbl %r9b, %edi
+; EGPR-NDD-NEXT:    adcq %rdi, %rax
+; EGPR-NDD-NEXT:    adcq $0, %rdx
 ; EGPR-NDD-NEXT:    addq %r12, %r25
 ; EGPR-NDD-NEXT:    movq %r25, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT:    adcq %r13, %r23, %r19
 ; EGPR-NDD-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %rbp, %r27
-; EGPR-NDD-NEXT:    movq %r27, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %rbx, %r10
-; EGPR-NDD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq $0, %rdi
-; EGPR-NDD-NEXT:    adcq $0, %r8
-; EGPR-NDD-NEXT:    adcq $0, %rax
+; EGPR-NDD-NEXT:    adcq %rbp, %r11
+; EGPR-NDD-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    adcq %rbx, %r8
+; EGPR-NDD-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    adcq $0, %rsi
 ; EGPR-NDD-NEXT:    adcq $0, %rcx
-; EGPR-NDD-NEXT:    addq %rdi, {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Folded Reload
-; EGPR-NDD-NEXT:    adcq %r8, %r30
+; EGPR-NDD-NEXT:    adcq $0, %rax
+; EGPR-NDD-NEXT:    adcq $0, %rdx
+; EGPR-NDD-NEXT:    addq %rsi, {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    adcq %rcx, %r30
 ; EGPR-NDD-NEXT:    adcq %rax, %r31
-; EGPR-NDD-NEXT:    adcq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; EGPR-NDD-NEXT:    adcq %rdx, {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; EGPR-NDD-NEXT:    setb %r8b
 ; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
 ; EGPR-NDD-NEXT:    movq %r13, %rax
@@ -1376,16 +1368,16 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NDD-NEXT:    movq %r10, %rax
 ; EGPR-NDD-NEXT:    mulq %r26
 ; EGPR-NDD-NEXT:    addq %rax, %r25
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %rsi
+; EGPR-NDD-NEXT:    adcq $0, %rdx, %rcx
 ; EGPR-NDD-NEXT:    movq %r13, %rax
 ; EGPR-NDD-NEXT:    mulq %r18
 ; EGPR-NDD-NEXT:    addq %r25, %rax, %rdi
-; EGPR-NDD-NEXT:    adcq %rdx, %rsi
+; EGPR-NDD-NEXT:    adcq %rdx, %rcx
 ; EGPR-NDD-NEXT:    setb %r9b
 ; EGPR-NDD-NEXT:    movq %r10, %rax
 ; EGPR-NDD-NEXT:    movq %r10, %r16
 ; EGPR-NDD-NEXT:    mulq %r18
-; EGPR-NDD-NEXT:    addq %rax, %rsi
+; EGPR-NDD-NEXT:    addq %rax, %rcx
 ; EGPR-NDD-NEXT:    movzbl %r9b, %eax
 ; EGPR-NDD-NEXT:    adcq %rax, %rdx, %r9
 ; EGPR-NDD-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload
@@ -1402,47 +1394,47 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NDD-NEXT:    mulq %r18
 ; EGPR-NDD-NEXT:    addq %r29, %rax, %rbx
 ; EGPR-NDD-NEXT:    adcq %rdx, %r10
-; EGPR-NDD-NEXT:    setb %r27b
+; EGPR-NDD-NEXT:    setb %r11b
 ; EGPR-NDD-NEXT:    movq %r12, %rax
 ; EGPR-NDD-NEXT:    mulq %r18
 ; EGPR-NDD-NEXT:    addq %r10, %rax
-; EGPR-NDD-NEXT:    movzbl %r27b, %r10d
+; EGPR-NDD-NEXT:    movzbl %r11b, %r10d
 ; EGPR-NDD-NEXT:    adcq %r10, %rdx
 ; EGPR-NDD-NEXT:    addq %rax, %r28, %r10
 ; EGPR-NDD-NEXT:    adcq %rdx, %rdi
-; EGPR-NDD-NEXT:    adcq $0, %rsi
+; EGPR-NDD-NEXT:    adcq $0, %rcx
 ; EGPR-NDD-NEXT:    adcq $0, %r9
 ; EGPR-NDD-NEXT:    movq %r23, %rax
-; EGPR-NDD-NEXT:    mulq %r11
+; EGPR-NDD-NEXT:    mulq %r27
 ; EGPR-NDD-NEXT:    movq %rdx, %r28
 ; EGPR-NDD-NEXT:    movq %rax, %r29
 ; EGPR-NDD-NEXT:    movq %r12, %rax
-; EGPR-NDD-NEXT:    mulq %r11
+; EGPR-NDD-NEXT:    mulq %r27
 ; EGPR-NDD-NEXT:    addq %rax, %r28
-; EGPR-NDD-NEXT:    adcq $0, %rdx, %r27
+; EGPR-NDD-NEXT:    adcq $0, %rdx, %r11
 ; EGPR-NDD-NEXT:    movq %r23, %rax
 ; EGPR-NDD-NEXT:    mulq %r17
 ; EGPR-NDD-NEXT:    addq %rax, %r28
-; EGPR-NDD-NEXT:    adcq %rdx, %r27
+; EGPR-NDD-NEXT:    adcq %rdx, %r11
 ; EGPR-NDD-NEXT:    setb %bpl
 ; EGPR-NDD-NEXT:    movq %r12, %rax
 ; EGPR-NDD-NEXT:    mulq %r17
-; EGPR-NDD-NEXT:    addq %r27, %rax
-; EGPR-NDD-NEXT:    movzbl %bpl, %r27d
-; EGPR-NDD-NEXT:    adcq %r27, %rdx
+; EGPR-NDD-NEXT:    addq %r11, %rax
+; EGPR-NDD-NEXT:    movzbl %bpl, %r11d
+; EGPR-NDD-NEXT:    adcq %r11, %rdx
 ; EGPR-NDD-NEXT:    addq %r29, %r10
 ; EGPR-NDD-NEXT:    adcq %r28, %rdi
 ; EGPR-NDD-NEXT:    adcq $0, %rax
 ; EGPR-NDD-NEXT:    adcq $0, %rdx
-; EGPR-NDD-NEXT:    addq %rax, %rsi
+; EGPR-NDD-NEXT:    addq %rax, %rcx
 ; EGPR-NDD-NEXT:    adcq %rdx, %r9
-; EGPR-NDD-NEXT:    setb %r27b
+; EGPR-NDD-NEXT:    setb %r11b
 ; EGPR-NDD-NEXT:    movq %r13, %rax
-; EGPR-NDD-NEXT:    mulq %r11
+; EGPR-NDD-NEXT:    mulq %r27
 ; EGPR-NDD-NEXT:    movq %rdx, %r28
 ; EGPR-NDD-NEXT:    movq %rax, %r29
 ; EGPR-NDD-NEXT:    movq %r16, %rax
-; EGPR-NDD-NEXT:    mulq %r11
+; EGPR-NDD-NEXT:    mulq %r27
 ; EGPR-NDD-NEXT:    addq %rax, %r28
 ; EGPR-NDD-NEXT:    adcq $0, %rdx, %r12
 ; EGPR-NDD-NEXT:    movq %r13, %rax
@@ -1455,10 +1447,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NDD-NEXT:    addq %r12, %rax
 ; EGPR-NDD-NEXT:    movzbl %bpl, %r12d
 ; EGPR-NDD-NEXT:    adcq %r12, %rdx
-; EGPR-NDD-NEXT:    addq %r29, %rsi
+; EGPR-NDD-NEXT:    addq %r29, %rcx
 ; EGPR-NDD-NEXT:    adcq %r28, %r9
-; EGPR-NDD-NEXT:    movzbl %r27b, %r27d
-; EGPR-NDD-NEXT:    adcq %r27, %rax
+; EGPR-NDD-NEXT:    movzbl %r11b, %r11d
+; EGPR-NDD-NEXT:    adcq %r11, %rax
 ; EGPR-NDD-NEXT:    adcq $0, %rdx
 ; EGPR-NDD-NEXT:    addq %r25, %r19
 ; EGPR-NDD-NEXT:    movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1466,9 +1458,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NDD-NEXT:    movq %r30, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT:    adcq %r31, %r10
 ; EGPR-NDD-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    adcq %rdi, %rcx
-; EGPR-NDD-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT:    movzbl %r8b, %ecx
+; EGPR-NDD-NEXT:    adcq %rdi, %rsi
+; EGPR-NDD-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT:    movzbl %r8b, %esi
 ; EGPR-NDD-NEXT:    adcq %rsi, %rcx
 ; EGPR-NDD-NEXT:    movq %rcx, (%rsp) # 8-byte Spill
 ; EGPR-NDD-NEXT:    adcq $0, %r9
@@ -1578,8 +1570,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NDD-NEXT:    addq %rbx, %rdx
 ; EGPR-NDD-NEXT:    imulq %r18, %r8
 ; EGPR-NDD-NEXT:    addq %rdx, %r8
-; EGPR-NDD-NEXT:    imulq %r29, %r11, %rcx
-; EGPR-NDD-NEXT:    movq %r11, %rax
+; EGPR-NDD-NEXT:    imulq %r29, %r27, %rcx
+; EGPR-NDD-NEXT:    movq %r27, %rax
 ; EGPR-NDD-NEXT:    mulq %r28
 ; EGPR-NDD-NEXT:    addq %rdx, %rcx
 ; EGPR-NDD-NEXT:    imulq %r28, %r17, %r16
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index d92e1a1e7b9d49..6354b7afbe4d3a 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -723,7 +723,8 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    andl $1431633920, %ebp # imm = 0x55550000
 ; X86-NEXT:    shrl %ebx
 ; X86-NEXT:    andl $1431633920, %ebx # imm = 0x55550000
-; X86-NEXT:    leal (%ebx,%ebp,2), %ebp
+; X86-NEXT:    leal (%ebx,%ebp,2), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    bswapl %edi
 ; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
@@ -740,8 +741,7 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
 ; X86-NEXT:    shrl %edi
 ; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NEXT:    leal (%edi,%ebx,2), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    leal (%edi,%ebx,2), %ebx
 ; X86-NEXT:    bswapl %esi
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
@@ -758,7 +758,7 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
 ; X86-NEXT:    shrl %esi
 ; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
-; X86-NEXT:    leal (%esi,%edi,2), %ebx
+; X86-NEXT:    leal (%esi,%edi,2), %edi
 ; X86-NEXT:    bswapl %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
@@ -887,7 +887,7 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %edi
+; X86-NEXT:    leal (%eax,%ecx,2), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl %eax, %ecx
@@ -906,7 +906,7 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl %eax, %ecx
@@ -925,7 +925,7 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
 ; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl %eax, %ecx
@@ -1000,8 +1000,7 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    leal (%eax,%ecx,2), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bswapl %eax
 ; X86-NEXT:    movl %eax, %ecx
@@ -1019,13 +1018,11 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
 ; X86-NEXT:    shrl %eax
 ; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
-; X86-NEXT:    leal (%eax,%ecx,2), %edx
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ecx, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    shrdl $16, %ebx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    leal (%eax,%ecx,2), %ebp
+; X86-NEXT:    shrdl $16, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    shrdl $16, %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    shrdl $16, %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1044,32 +1041,30 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    shrdl $16, %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shrdl $16, %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shrdl $16, %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ecx, %eax
+; X86-NEXT:    shrdl $16, %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ebp, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ebx, %ebp
+; X86-NEXT:    shrdl $16, %ebx, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    shrdl $16, %edi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shrdl $16, %ecx, %edi
-; X86-NEXT:    shrdl $16, %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shrdl $16, %esi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shrdl $16, %eax, %esi
+; X86-NEXT:    shrdl $16, %edx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shrdl $16, %ebp, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, 60(%eax)
-; X86-NEXT:    movl %edi, 56(%eax)
-; X86-NEXT:    movl %ebx, 52(%eax)
-; X86-NEXT:    movl %ebp, 48(%eax)
+; X86-NEXT:    movl %edx, 60(%eax)
+; X86-NEXT:    movl %ecx, 56(%eax)
+; X86-NEXT:    movl %esi, 52(%eax)
+; X86-NEXT:    movl %edi, 48(%eax)
+; X86-NEXT:    movl %ebx, 44(%eax)
 ; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 44(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 40(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 36(%eax)
@@ -1089,9 +1084,10 @@ define i528 @large_promotion(i528 %A) nounwind {
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
-; X86-NEXT:    shrl $16, %edx
-; X86-NEXT:    movw %dx, 64(%eax)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    shrl $16, %ebp
+; X86-NEXT:    movw %bp, 64(%eax)
 ; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index f66d81c781fe0d..64c9b3e51583ce 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -200,27 +200,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    xorl %edx, %ebx
 ; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl 32(%ebp), %ebx
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %edi
 ; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    subl %edx, %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    subl %edx, %esi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %cl
@@ -232,92 +231,88 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    bsrl %esi, %edx
+; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    bsrl %edi, %edx
 ; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    bsrl %edi, %edi
-; X86-NEXT:    xorl $31, %edi
-; X86-NEXT:    orl $32, %edi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    cmovnel %edx, %eax
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    cmovnel %edx, %edi
-; X86-NEXT:    orl $64, %edi
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    cmovnel %ecx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    bsrl %ebx, %esi
-; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    bsrl %esi, %edi
+; X86-NEXT:    xorl $31, %edi
 ; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    orl $32, %edx
-; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    cmovnel %esi, %edx
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    cmovnel %edi, %edx
 ; X86-NEXT:    orl $64, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl %edx, %edi
+; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %edi, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %esi
+; X86-NEXT:    cmovnel %esi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %eax
+; X86-NEXT:    cmovnel %esi, %eax
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    jne .LBB4_8
+; X86-NEXT:  # %bb.1: # %_udiv-special-cases
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %ebx, %eax
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    jne .LBB4_1
-; X86-NEXT:  # %bb.8: # %_udiv-special-cases
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    xorl $127, %eax
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    je .LBB4_9
-; X86-NEXT:  # %bb.5: # %udiv-bb1
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    je .LBB4_8
+; X86-NEXT:  # %bb.2: # %udiv-bb1
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    xorps %xmm0, %xmm0
@@ -328,80 +323,71 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    negb %al
 ; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl 152(%esp,%eax), %esi
-; X86-NEXT:    movl 156(%esp,%eax), %edx
-; X86-NEXT:    shldl %cl, %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 144(%esp,%eax), %edx
+; X86-NEXT:    movl 152(%esp,%eax), %edx
+; X86-NEXT:    movl 156(%esp,%eax), %ebx
+; X86-NEXT:    shldl %cl, %edx, %ebx
+; X86-NEXT:    movl 144(%esp,%eax), %esi
 ; X86-NEXT:    movl 148(%esp,%eax), %eax
-; X86-NEXT:    shldl %cl, %eax, %esi
-; X86-NEXT:    shldl %cl, %edx, %eax
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl $1, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    shldl %cl, %eax, %edx
+; X86-NEXT:    shldl %cl, %esi, %eax
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $1, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    jae .LBB4_2
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    jae .LBB4_3
 ; X86-NEXT:  # %bb.6:
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    jmp .LBB4_7
-; X86-NEXT:  .LBB4_1:
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    jmp .LBB4_9
-; X86-NEXT:  .LBB4_2: # %udiv-preheader
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:  .LBB4_3: # %udiv-preheader
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 108(%esp,%eax), %edx
+; X86-NEXT:    movl 108(%esp,%eax), %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 104(%esp,%eax), %ebx
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    shrdl %cl, %edx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 96(%esp,%eax), %esi
-; X86-NEXT:    movl 100(%esp,%eax), %eax
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    shrdl %cl, %ebx, %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl 104(%esp,%eax), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 96(%esp,%eax), %edi
+; X86-NEXT:    movl 100(%esp,%eax), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    shrdl %cl, %edx, %esi
+; X86-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NEXT:    shrdl %cl, %eax, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    shrdl %cl, %eax, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -411,148 +397,142 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $-1, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $-1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    .p2align 4
-; X86-NEXT:  .LBB4_3: # %udiv-do-while
+; X86-NEXT:  .LBB4_4: # %udiv-do-while
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    shldl $1, %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ebx, %edx
-; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    shldl $1, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    shldl $1, %esi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    shldl $1, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    shldl $1, %ecx, %esi
+; X86-NEXT:    shldl $1, %ebx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    shldl $1, %ecx, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    shldl $1, %edi, %ecx
-; X86-NEXT:    orl %esi, %ecx
+; X86-NEXT:    shldl $1, %ecx, %ebx
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    shldl $1, %edx, %ecx
+; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edi, %edi
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %edx
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmpl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    sarl $31, %ecx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl $1, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    subl %ecx, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl $-1, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %edx
 ; X86-NEXT:    adcl $-1, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    orl %edi, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    jne .LBB4_3
-; X86-NEXT:  # %bb.4:
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    jne .LBB4_4
+; X86-NEXT:  # %bb.5:
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
-; X86-NEXT:    shldl $1, %ebx, %esi
-; X86-NEXT:    orl %edx, %esi
-; X86-NEXT:    shldl $1, %eax, %ebx
-; X86-NEXT:    orl %edx, %ebx
-; X86-NEXT:    shldl $1, %edi, %eax
-; X86-NEXT:    orl %edx, %eax
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    shldl $1, %edx, %ebx
+; X86-NEXT:    orl %ecx, %ebx
+; X86-NEXT:    shldl $1, %eax, %edx
 ; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:  .LBB4_9: # %udiv-end
+; X86-NEXT:    shldl $1, %esi, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    addl %esi, %esi
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:  .LBB4_8: # %udiv-end
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    xorl %ecx, %esi
 ; X86-NEXT:    xorl %ecx, %ebx
-; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    xorl %ecx, %esi
+; X86-NEXT:    subl %ecx, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 44(%ebp), %ecx
-; X86-NEXT:    movl %edx, (%ecx)
+; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    movl %eax, 4(%ecx)
-; X86-NEXT:    movl %ebx, 8(%ecx)
-; X86-NEXT:    movl %esi, 12(%ecx)
+; X86-NEXT:    movl %edx, 8(%ecx)
+; X86-NEXT:    movl %ebx, 12(%ecx)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl 32(%ebp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    imull %esi, %edi
@@ -796,10 +776,10 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; X64-NEXT:    movzbl %al, %r13d
 ; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movzbl %al, %edx
+; X64-NEXT:    movzbl %al, %ecx
 ; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    movl %eax, %edx
 ; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movd %edi, %xmm3
@@ -823,9 +803,9 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; X64-NEXT:    movd %r13d, %xmm3
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; X64-NEXT:    movd %edx, %xmm6
+; X64-NEXT:    movd %ecx, %xmm6
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    movzbl %dl, %ecx
 ; X64-NEXT:    movd %ecx, %xmm4
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
 ; X64-NEXT:    movzbl %al, %eax
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 6fdde0b14a9843..613ba58632f66d 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -199,30 +199,30 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl 36(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    bsrl %eax, %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    bsrl %edi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl %ebx, %eax
-; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    bsrl %ebx, %ebx
+; X86-NEXT:    xorl $31, %ebx
+; X86-NEXT:    orl $32, %ebx
 ; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    cmovnel %edx, %eax
-; X86-NEXT:    orl $64, %eax
-; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    cmovnel %edx, %ebx
+; X86-NEXT:    orl $64, %ebx
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    orl %esi, %edx
-; X86-NEXT:    cmovnel %ecx, %eax
-; X86-NEXT:    movl 24(%ebp), %ebx
-; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    cmovnel %ecx, %ebx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    bsrl %eax, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl 20(%ebp), %ecx
-; X86-NEXT:    bsrl %ecx, %ecx
+; X86-NEXT:    bsrl 20(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
 ; X86-NEXT:    orl $32, %ecx
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    movl 16(%ebp), %edi
 ; X86-NEXT:    bsrl %edi, %esi
@@ -233,51 +233,53 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %esi, %edx
 ; X86-NEXT:    orl $64, %edx
-; X86-NEXT:    movl 20(%ebp), %edi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    movl 20(%ebp), %esi
+; X86-NEXT:    orl %eax, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
-; X86-NEXT:    subl %edx, %eax
+; X86-NEXT:    subl %edx, %ebx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    movl $127, %ecx
-; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    cmpl %ebx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %esi, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %edi, %ecx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    xorl $127, %eax
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %ebx, %eax
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    xorl $127, %eax
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    orl %edi, %edx
 ; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    sete %al
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    testb %cl, %cl
-; X86-NEXT:    movb %cl, %ah
 ; X86-NEXT:    movl 24(%ebp), %ebx
-; X86-NEXT:    movl $0, %esi
-; X86-NEXT:    cmovnel %esi, %ebx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    cmovnel %esi, %ecx
-; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    cmovnel %edi, %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    cmovnel %edi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 16(%ebp), %esi
-; X86-NEXT:    cmovnel %edx, %esi
+; X86-NEXT:    cmovnel %edi, %esi
 ; X86-NEXT:    movl 12(%ebp), %edi
 ; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    orb %ah, %al
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    cmovnel %eax, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    orb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl 44(%ebp), %eax
 ; X86-NEXT:    jne .LBB4_7
 ; X86-NEXT:  # %bb.1: # %udiv-bb1
@@ -286,12 +288,11 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 16(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 20(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    xorb $127, %cl
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    shrb $3, %al
@@ -310,14 +311,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    shldl %cl, %ebx, %edi
 ; X86-NEXT:    shll %cl, %ebx
 ; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    addl $1, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl 20(%ebp), %ebx
 ; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.5:
 ; X86-NEXT:    xorl %edx, %edx
@@ -325,8 +324,8 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    jmp .LBB4_6
 ; X86-NEXT:  .LBB4_2: # %udiv-preheader
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 12(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 16(%ebp), %edx
@@ -340,13 +339,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    shrb $3, %al
 ; X86-NEXT:    andb $12, %al
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    movl 92(%esp,%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 92(%esp,%eax), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 88(%esp,%eax), %edx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -355,10 +356,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrdl %cl, %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrdl %cl, %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -368,7 +374,6 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl 32(%ebp), %eax
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%ebp), %esi
 ; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl 40(%ebp), %eax
@@ -478,36 +483,35 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    imull %edx, %esi
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    movl 40(%ebp), %edi
 ; X86-NEXT:    imull %ecx, %edi
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl 28(%ebp), %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull 28(%ebp), %ebx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    imull %esi, %ebx
 ; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    movl 32(%ebp), %edx
-; X86-NEXT:    imull %edx, %esi
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl 28(%ebp), %ecx
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    imull %edx, %eax
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
@@ -749,10 +753,10 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; X64-NEXT:    movzbl %al, %r13d
 ; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movzbl %al, %edx
+; X64-NEXT:    movzbl %al, %ecx
 ; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    movl %eax, %edx
 ; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movd %edi, %xmm3
@@ -776,9 +780,9 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; X64-NEXT:    movd %r13d, %xmm3
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; X64-NEXT:    movd %edx, %xmm6
+; X64-NEXT:    movd %ecx, %xmm6
 ; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; X64-NEXT:    movzbl %cl, %ecx
+; X64-NEXT:    movzbl %dl, %ecx
 ; X64-NEXT:    movd %ecx, %xmm4
 ; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
 ; X64-NEXT:    movzbl %al, %eax
diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll
index 90e075bfabf0a2..7a5cc24f4d6390 100644
--- a/llvm/test/CodeGen/X86/extract-bits.ll
+++ b/llvm/test/CodeGen/X86/extract-bits.ll
@@ -1421,26 +1421,27 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    pushl %edi
 ; X86-NOBMI-NEXT:    pushl %esi
 ; X86-NOBMI-NEXT:    subl $12, %esp
-; X86-NOBMI-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NOBMI-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOBMI-NEXT:    movl %esi, %ebp
-; X86-NOBMI-NEXT:    movl %eax, %ecx
+; X86-NOBMI-NEXT:    movl %edx, %ebp
+; X86-NOBMI-NEXT:    movb %al, %cl
 ; X86-NOBMI-NEXT:    shrl %cl, %ebp
-; X86-NOBMI-NEXT:    shrdl %cl, %esi, %ebx
+; X86-NOBMI-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-NOBMI-NEXT:    testb $32, %al
 ; X86-NOBMI-NEXT:    je .LBB13_2
 ; X86-NOBMI-NEXT:  # %bb.1:
 ; X86-NOBMI-NEXT:    movl %ebp, %ebx
 ; X86-NOBMI-NEXT:    xorl %ebp, %ebp
 ; X86-NOBMI-NEXT:  .LBB13_2:
+; X86-NOBMI-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT:    movl $1, %esi
 ; X86-NOBMI-NEXT:    xorl %edi, %edi
-; X86-NOBMI-NEXT:    movl %edx, %ecx
+; X86-NOBMI-NEXT:    movb %ch, %cl
 ; X86-NOBMI-NEXT:    shldl %cl, %esi, %edi
 ; X86-NOBMI-NEXT:    shll %cl, %esi
-; X86-NOBMI-NEXT:    testb $32, %dl
+; X86-NOBMI-NEXT:    testb $32, %ch
 ; X86-NOBMI-NEXT:    je .LBB13_4
 ; X86-NOBMI-NEXT:  # %bb.3:
 ; X86-NOBMI-NEXT:    movl %esi, %edi
@@ -1451,7 +1452,7 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-NOBMI-NEXT:    andl %ebx, %esi
 ; X86-NOBMI-NEXT:    andl %ebp, %edi
 ; X86-NOBMI-NEXT:    subl $8, %esp
-; X86-NOBMI-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NOBMI-NEXT:    pushl %edx
 ; X86-NOBMI-NEXT:    pushl %eax
 ; X86-NOBMI-NEXT:    calll use64 at PLT
 ; X86-NOBMI-NEXT:    addl $16, %esp
@@ -1471,26 +1472,27 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    subl $12, %esp
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl %esi, %ebp
-; X86-BMI1-NEXT:    movl %eax, %ecx
+; X86-BMI1-NEXT:    movl %edx, %ebp
+; X86-BMI1-NEXT:    movb %al, %cl
 ; X86-BMI1-NEXT:    shrl %cl, %ebp
-; X86-BMI1-NEXT:    shrdl %cl, %esi, %ebx
+; X86-BMI1-NEXT:    shrdl %cl, %edx, %ebx
 ; X86-BMI1-NEXT:    testb $32, %al
 ; X86-BMI1-NEXT:    je .LBB13_2
 ; X86-BMI1-NEXT:  # %bb.1:
 ; X86-BMI1-NEXT:    movl %ebp, %ebx
 ; X86-BMI1-NEXT:    xorl %ebp, %ebp
 ; X86-BMI1-NEXT:  .LBB13_2:
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl $1, %esi
 ; X86-BMI1-NEXT:    xorl %edi, %edi
-; X86-BMI1-NEXT:    movl %edx, %ecx
+; X86-BMI1-NEXT:    movb %ch, %cl
 ; X86-BMI1-NEXT:    shldl %cl, %esi, %edi
 ; X86-BMI1-NEXT:    shll %cl, %esi
-; X86-BMI1-NEXT:    testb $32, %dl
+; X86-BMI1-NEXT:    testb $32, %ch
 ; X86-BMI1-NEXT:    je .LBB13_4
 ; X86-BMI1-NEXT:  # %bb.3:
 ; X86-BMI1-NEXT:    movl %esi, %edi
@@ -1501,7 +1503,7 @@ define i64 @bextr64_a5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1-NEXT:    andl %ebx, %esi
 ; X86-BMI1-NEXT:    andl %ebp, %edi
 ; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-BMI1-NEXT:    pushl %edx
 ; X86-BMI1-NEXT:    pushl %eax
 ; X86-BMI1-NEXT:    calll use64 at PLT
 ; X86-BMI1-NEXT:    addl $16, %esp
@@ -3422,34 +3424,35 @@ define i64 @bextr64_b5_skipextrauses(i64 %val, i64 %numskipbits, i64 %numlowbits
 ; X86-BMI1-NEXT:    pushl %edi
 ; X86-BMI1-NEXT:    pushl %esi
 ; X86-BMI1-NEXT:    subl $12, %esp
-; X86-BMI1-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-BMI1-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-BMI1-NEXT:    movl %ebx, %esi
-; X86-BMI1-NEXT:    movl %eax, %ecx
+; X86-BMI1-NEXT:    movl %edx, %esi
+; X86-BMI1-NEXT:    movb %al, %cl
 ; X86-BMI1-NEXT:    shrl %cl, %esi
-; X86-BMI1-NEXT:    shrdl %cl, %ebx, %edi
+; X86-BMI1-NEXT:    shrdl %cl, %edx, %edi
 ; X86-BMI1-NEXT:    testb $32, %al
 ; X86-BMI1-NEXT:    je .LBB30_2
 ; X86-BMI1-NEXT:  # %bb.1:
 ; X86-BMI1-NEXT:    movl %esi, %edi
 ; X86-BMI1-NEXT:    xorl %esi, %esi
 ; X86-BMI1-NEXT:  .LBB30_2:
-; X86-BMI1-NEXT:    movl $-1, %ebx
+; X86-BMI1-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-BMI1-NEXT:    movl $-1, %edx
 ; X86-BMI1-NEXT:    movl $-1, %ebp
-; X86-BMI1-NEXT:    movl %edx, %ecx
+; X86-BMI1-NEXT:    movb %ch, %cl
 ; X86-BMI1-NEXT:    shll %cl, %ebp
-; X86-BMI1-NEXT:    testb $32, %dl
+; X86-BMI1-NEXT:    testb $32, %ch
 ; X86-BMI1-NEXT:    je .LBB30_4
 ; X86-BMI1-NEXT:  # %bb.3:
-; X86-BMI1-NEXT:    movl %ebp, %ebx
+; X86-BMI1-NEXT:    movl %ebp, %edx
 ; X86-BMI1-NEXT:    xorl %ebp, %ebp
 ; X86-BMI1-NEXT:  .LBB30_4:
-; X86-BMI1-NEXT:    andnl %esi, %ebx, %esi
+; X86-BMI1-NEXT:    andnl %esi, %edx, %esi
 ; X86-BMI1-NEXT:    andnl %edi, %ebp, %edi
 ; X86-BMI1-NEXT:    subl $8, %esp
-; X86-BMI1-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-BMI1-NEXT:    pushl %ebx
 ; X86-BMI1-NEXT:    pushl %eax
 ; X86-BMI1-NEXT:    calll use64 at PLT
 ; X86-BMI1-NEXT:    addl $16, %esp
diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll
index 5ea2964057588f..88a88e7db53e7d 100644
--- a/llvm/test/CodeGen/X86/fold-tied-op.ll
+++ b/llvm/test/CodeGen/X86/fold-tied-op.ll
@@ -24,45 +24,47 @@ define i64 @fn1() #0 {
 ; CHECK-NEXT:    .cfi_offset %esi, -20
 ; CHECK-NEXT:    .cfi_offset %edi, -16
 ; CHECK-NEXT:    .cfi_offset %ebx, -12
-; CHECK-NEXT:    movl $-1028477379, %ecx # imm = 0xC2B2AE3D
-; CHECK-NEXT:    movl $668265295, %esi # imm = 0x27D4EB4F
+; CHECK-NEXT:    movl $-1028477379, %ebx # imm = 0xC2B2AE3D
+; CHECK-NEXT:    movl $668265295, %ecx # imm = 0x27D4EB4F
 ; CHECK-NEXT:    movl a, %edi
 ; CHECK-NEXT:    cmpl $0, (%edi)
 ; CHECK-NEXT:    je .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
-; CHECK-NEXT:    movl 8(%edi), %ecx
+; CHECK-NEXT:    movl 8(%edi), %esi
 ; CHECK-NEXT:    movl 12(%edi), %edx
 ; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    shldl $1, %ecx, %eax
+; CHECK-NEXT:    shldl $1, %esi, %eax
 ; CHECK-NEXT:    orl %edx, %eax
-; CHECK-NEXT:    leal (%ecx,%ecx), %edx
-; CHECK-NEXT:    orl %ecx, %edx
+; CHECK-NEXT:    leal (%esi,%esi), %edx
+; CHECK-NEXT:    orl %esi, %edx
 ; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 16(%edi), %ebx
-; CHECK-NEXT:    movl 20(%edi), %edx
+; CHECK-NEXT:    movl 16(%edi), %edx
 ; CHECK-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    shldl $2, %ebx, %edx
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    shldl $31, %ebx, %ecx
-; CHECK-NEXT:    shll $2, %ebx
-; CHECK-NEXT:    orl %ecx, %ebx
-; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT:    shrl %ecx
-; CHECK-NEXT:    orl %edx, %ecx
-; CHECK-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; CHECK-NEXT:    movl 20(%edi), %esi
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    shldl $2, %edx, %ebx
 ; CHECK-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    adcl %eax, %ecx
-; CHECK-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT:    shldl $31, %edx, %ebx
+; CHECK-NEXT:    shll $2, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT:    orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT:    shrl %esi
+; CHECK-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-NEXT:    adcl %eax, %esi
+; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    movl 24(%edi), %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; CHECK-NEXT:    movl $-1028477379, %ebx # imm = 0xC2B2AE3D
 ; CHECK-NEXT:    imull %eax, %ebx
-; CHECK-NEXT:    mull %esi
-; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    mull %ecx
+; CHECK-NEXT:    movl %eax, %esi
 ; CHECK-NEXT:    addl %ebx, %edx
 ; CHECK-NEXT:    movl 28(%edi), %edi
-; CHECK-NEXT:    imull %edi, %esi
-; CHECK-NEXT:    addl %edx, %esi
+; CHECK-NEXT:    imull %edi, %ecx
+; CHECK-NEXT:    addl %edx, %ecx
 ; CHECK-NEXT:    movl $1336530590, %edx # imm = 0x4FA9D69E
 ; CHECK-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; CHECK-NEXT:    movl %ebx, %eax
@@ -71,17 +73,17 @@ define i64 @fn1() #0 {
 ; CHECK-NEXT:    addl %edx, %ebx
 ; CHECK-NEXT:    imull $1336530590, %edi, %edx # imm = 0x4FA9D69E
 ; CHECK-NEXT:    addl %ebx, %edx
-; CHECK-NEXT:    shrdl $3, %esi, %ecx
-; CHECK-NEXT:    sarl $3, %esi
-; CHECK-NEXT:    orl %edx, %esi
-; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    shrdl $3, %ecx, %esi
+; CHECK-NEXT:    sarl $3, %ecx
+; CHECK-NEXT:    orl %edx, %ecx
+; CHECK-NEXT:    orl %eax, %esi
 ; CHECK-NEXT:    movl $-66860409, %ebx # imm = 0xFC03CA87
-; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    mull %ebx
 ; CHECK-NEXT:    movl %eax, %edi
-; CHECK-NEXT:    imull $326129324, %ecx, %eax # imm = 0x137056AC
+; CHECK-NEXT:    imull $326129324, %esi, %eax # imm = 0x137056AC
 ; CHECK-NEXT:    addl %edx, %eax
-; CHECK-NEXT:    imull $-66860409, %esi, %ecx # imm = 0xFC03CA87
+; CHECK-NEXT:    imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
 ; CHECK-NEXT:    addl %eax, %ecx
 ; CHECK-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; CHECK-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -94,14 +96,14 @@ define i64 @fn1() #0 {
 ; CHECK-NEXT:    imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
 ; CHECK-NEXT:    jmp .LBB0_3
 ; CHECK-NEXT:  .LBB0_2: # %if.else
-; CHECK-NEXT:    xorl b+4, %ecx
-; CHECK-NEXT:    xorl b, %esi
+; CHECK-NEXT:    xorl b+4, %ebx
+; CHECK-NEXT:    xorl b, %ecx
 ; CHECK-NEXT:    movl $1419758215, %edx # imm = 0x549FCA87
-; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    mull %edx
-; CHECK-NEXT:    imull $93298681, %esi, %esi # imm = 0x58F9FF9
+; CHECK-NEXT:    imull $93298681, %ecx, %esi # imm = 0x58F9FF9
 ; CHECK-NEXT:    addl %edx, %esi
-; CHECK-NEXT:    imull $1419758215, %ecx, %ecx # imm = 0x549FCA87
+; CHECK-NEXT:    imull $1419758215, %ebx, %ecx # imm = 0x549FCA87
 ; CHECK-NEXT:  .LBB0_3: # %if.end
 ; CHECK-NEXT:    addl %esi, %ecx
 ; CHECK-NEXT:    addl $-1028477341, %eax # imm = 0xC2B2AE63
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index 5263e0d4f6f39f..8d51c17d49342f 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -1777,14 +1777,14 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __eqtf2
@@ -1795,8 +1795,8 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __unordtf2
@@ -1913,14 +1913,14 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __eqtf2
@@ -1931,8 +1931,8 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __unordtf2
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
index 536a1ae3b918de..91967ed42ef523 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
@@ -245,11 +245,11 @@ define <4 x i128> @test_signed_v4i128_v4f32(<4 x float> %f) nounwind {
 ; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbq %r14, %rax
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; CHECK-NEXT:    cmovbq %rcx, %r15
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rbp # imm = 0x8000000000000000
+; CHECK-NEXT:    cmovbq %rbp, %r15
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    movabsq $9223372036854775807, %rbp # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-NEXT:    cmovaq %rbp, %r15
+; CHECK-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    cmovaq %rcx, %r15
 ; CHECK-NEXT:    movq $-1, %rcx
 ; CHECK-NEXT:    cmovaq %rcx, %rax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
@@ -265,10 +265,10 @@ define <4 x i128> @test_signed_v4i128_v4f32(<4 x float> %f) nounwind {
 ; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbq %r14, %r12
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; CHECK-NEXT:    cmovbq %rax, %r13
+; CHECK-NEXT:    cmovbq %rbp, %r13
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %rbp, %r13
+; CHECK-NEXT:    movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    cmovaq %rax, %r13
 ; CHECK-NEXT:    movq $-1, %rax
 ; CHECK-NEXT:    cmovaq %rax, %r12
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
@@ -1187,8 +1187,8 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbq %r12, %rax
-; CHECK-NEXT:    movabsq $-9223372036854775808, %rbp # imm = 0x8000000000000000
-; CHECK-NEXT:    cmovbq %rbp, %rdx
+; CHECK-NEXT:    movabsq $-9223372036854775808, %r14 # imm = 0x8000000000000000
+; CHECK-NEXT:    cmovbq %r14, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    movabsq $9223372036854775807, %r15 # imm = 0x7FFFFFFFFFFFFFFF
 ; CHECK-NEXT:    cmovaq %r15, %rdx
@@ -1209,11 +1209,11 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbq %r12, %rax
-; CHECK-NEXT:    cmovbq %rbp, %rdx
+; CHECK-NEXT:    cmovbq %r14, %rdx
+; CHECK-NEXT:    movq %r14, %rbp
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovaq %r15, %rdx
 ; CHECK-NEXT:    cmovaq %r13, %rax
-; CHECK-NEXT:    movq $-1, %r14
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
 ; CHECK-NEXT:    cmovpq %r12, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1228,11 +1228,12 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbq %r12, %rax
-; CHECK-NEXT:    cmovbq %rbp, %rdx
+; CHECK-NEXT:    cmovbq %r14, %rdx
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovaq %r15, %rdx
-; CHECK-NEXT:    cmovaq %r14, %rax
+; CHECK-NEXT:    movq %r15, %r13
 ; CHECK-NEXT:    movq $-1, %r14
+; CHECK-NEXT:    cmovaq %r14, %rax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
 ; CHECK-NEXT:    cmovpq %r12, %rax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1247,10 +1248,10 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbq %r12, %rax
+; CHECK-NEXT:    movq %rbp, %r15
 ; CHECK-NEXT:    cmovbq %rbp, %rdx
-; CHECK-NEXT:    movq %rbp, %r13
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %r15, %rdx
+; CHECK-NEXT:    cmovaq %r13, %rdx
 ; CHECK-NEXT:    cmovaq %r14, %rax
 ; CHECK-NEXT:    movq $-1, %r14
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
@@ -1268,10 +1269,9 @@ define <8 x i128> @test_signed_v8i128_v8f16(<8 x half> %f) nounwind {
 ; CHECK-NEXT:    # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    cmovbq %r12, %rax
-; CHECK-NEXT:    cmovbq %r13, %rbp
+; CHECK-NEXT:    cmovbq %r15, %rbp
 ; CHECK-NEXT:    ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT:    cmovaq %r15, %rbp
-; CHECK-NEXT:    movq %r15, %r13
+; CHECK-NEXT:    cmovaq %r13, %rbp
 ; CHECK-NEXT:    cmovaq %r14, %rax
 ; CHECK-NEXT:    ucomiss %xmm0, %xmm0
 ; CHECK-NEXT:    cmovpq %r12, %rax
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 4340f8fd484aeb..025ee5a65775f1 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -315,66 +315,61 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-NEXT:    pushl %ebx
 ; X86-SLOW-NEXT:    pushl %edi
 ; X86-SLOW-NEXT:    pushl %esi
-; X86-SLOW-NEXT:    subl $8, %esp
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-SLOW-NEXT:    pushl %eax
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    testb $64, %cl
 ; X86-SLOW-NEXT:    je .LBB6_1
 ; X86-SLOW-NEXT:  # %bb.2:
 ; X86-SLOW-NEXT:    movl %ebp, %eax
-; X86-SLOW-NEXT:    movl %ebx, %ebp
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SLOW-NEXT:    movl %edi, %edx
-; X86-SLOW-NEXT:    movl %esi, %edi
+; X86-SLOW-NEXT:    movl %edi, %ebp
+; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-SLOW-NEXT:    movl %ebx, %edx
+; X86-SLOW-NEXT:    movl %esi, %ebx
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-SLOW-NEXT:    testb $32, %cl
-; X86-SLOW-NEXT:    jne .LBB6_5
-; X86-SLOW-NEXT:  .LBB6_4:
-; X86-SLOW-NEXT:    movl %ebx, %esi
-; X86-SLOW-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %ebp, %edi
-; X86-SLOW-NEXT:    movl %edx, %ebp
-; X86-SLOW-NEXT:    movl %eax, %edx
-; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:    je .LBB6_4
+; X86-SLOW-NEXT:    jmp .LBB6_5
 ; X86-SLOW-NEXT:  .LBB6_1:
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT:    testb $32, %cl
-; X86-SLOW-NEXT:    je .LBB6_4
+; X86-SLOW-NEXT:    jne .LBB6_5
+; X86-SLOW-NEXT:  .LBB6_4:
+; X86-SLOW-NEXT:    movl %edi, %esi
+; X86-SLOW-NEXT:    movl %ebx, %edi
+; X86-SLOW-NEXT:    movl %ebp, %ebx
+; X86-SLOW-NEXT:    movl %edx, %ebp
+; X86-SLOW-NEXT:    movl %eax, %edx
 ; X86-SLOW-NEXT:  .LBB6_5:
-; X86-SLOW-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:  .LBB6_6:
 ; X86-SLOW-NEXT:    shrl %cl, %edx
-; X86-SLOW-NEXT:    movl %ecx, %ebx
-; X86-SLOW-NEXT:    notb %bl
+; X86-SLOW-NEXT:    notb %cl
+; X86-SLOW-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-SLOW-NEXT:    leal (%ebp,%ebp), %eax
-; X86-SLOW-NEXT:    movl %ebx, %ecx
 ; X86-SLOW-NEXT:    shll %cl, %eax
 ; X86-SLOW-NEXT:    orl %edx, %eax
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-SLOW-NEXT:    shrl %cl, %ebp
-; X86-SLOW-NEXT:    leal (%edi,%edi), %edx
-; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    leal (%ebx,%ebx), %edx
+; X86-SLOW-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-SLOW-NEXT:    shll %cl, %edx
 ; X86-SLOW-NEXT:    orl %ebp, %edx
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT:    shrl %cl, %edi
-; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl (%esp), %edi # 4-byte Reload
+; X86-SLOW-NEXT:    shrl %cl, %ebx
 ; X86-SLOW-NEXT:    leal (%edi,%edi), %ebp
-; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-SLOW-NEXT:    shll %cl, %ebp
-; X86-SLOW-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-SLOW-NEXT:    orl %ebx, %ebp
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-SLOW-NEXT:    shrl %cl, %edi
 ; X86-SLOW-NEXT:    addl %esi, %esi
-; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-SLOW-NEXT:    shll %cl, %esi
 ; X86-SLOW-NEXT:    orl %edi, %esi
 ; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -383,7 +378,7 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-NEXT:    movl %edx, 4(%ecx)
 ; X86-SLOW-NEXT:    movl %eax, (%ecx)
 ; X86-SLOW-NEXT:    movl %ecx, %eax
-; X86-SLOW-NEXT:    addl $8, %esp
+; X86-SLOW-NEXT:    addl $4, %esp
 ; X86-SLOW-NEXT:    popl %esi
 ; X86-SLOW-NEXT:    popl %edi
 ; X86-SLOW-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 595f8491b405c9..f8b01f5f3056ff 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -883,6 +883,8 @@ define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(ptr %ptr) nounwind uwtable noin
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
 ; X86-SSE1-NEXT:    pushl %esi
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 20
+; X86-SSE1-NEXT:    pushl %eax
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 24
 ; X86-SSE1-NEXT:    .cfi_offset %esi, -20
 ; X86-SSE1-NEXT:    .cfi_offset %edi, -16
 ; X86-SSE1-NEXT:    .cfi_offset %ebx, -12
@@ -894,13 +896,17 @@ define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(ptr %ptr) nounwind uwtable noin
 ; X86-SSE1-NEXT:    movl 7(%ecx), %edi
 ; X86-SSE1-NEXT:    movzwl 11(%ecx), %ebx
 ; X86-SSE1-NEXT:    movzbl 13(%ecx), %edx
+; X86-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-SSE1-NEXT:    movzbl 15(%ecx), %ecx
+; X86-SSE1-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
 ; X86-SSE1-NEXT:    movb %dl, 13(%eax)
 ; X86-SSE1-NEXT:    movb %cl, 15(%eax)
 ; X86-SSE1-NEXT:    movw %bx, 11(%eax)
 ; X86-SSE1-NEXT:    movl %edi, 7(%eax)
 ; X86-SSE1-NEXT:    movl %esi, 3(%eax)
 ; X86-SSE1-NEXT:    movw %bp, (%eax)
+; X86-SSE1-NEXT:    addl $4, %esp
+; X86-SSE1-NEXT:    .cfi_def_cfa_offset 20
 ; X86-SSE1-NEXT:    popl %esi
 ; X86-SSE1-NEXT:    .cfi_def_cfa_offset 16
 ; X86-SSE1-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll
index bb93e34fda7c4d..c821412d130bd7 100644
--- a/llvm/test/CodeGen/X86/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/mul-i1024.ll
@@ -10,14 +10,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $400, %esp # imm = 0x190
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl 60(%eax), %ebp
-; X86-NEXT:    movl 56(%eax), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%ebx), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 56(%eax), %ebx
+; X86-NEXT:    movl (%edx), %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
@@ -27,10 +28,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl 4(%ebx), %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 4(%eax), %edi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -44,108 +46,102 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 48(%ecx), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl 48(%ecx), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl 52(%ecx), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %bl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 8(%eax), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 12(%eax), %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%eax), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
@@ -160,9 +156,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 44(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -172,11 +167,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -184,9 +179,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 32(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -198,63 +192,58 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
@@ -265,35 +254,33 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -317,7 +304,6 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl 20(%eax), %ecx
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -336,153 +322,145 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 24(%eax), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl 24(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 28(%eax), %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%eax), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -492,8 +470,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -507,81 +484,77 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    adcl $0, %edx
@@ -610,9 +583,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl 28(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
@@ -626,18 +598,16 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 16(%ecx), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 16(%ecx), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -649,94 +619,90 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebp, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl 8(%ecx), %ebx
 ; X86-NEXT:    movl %ebx, %eax
@@ -745,9 +711,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 12(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -757,11 +722,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -776,8 +741,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl 4(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl 4(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
@@ -791,55 +756,50 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
@@ -850,51 +810,48 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    adcl %ebp, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
@@ -903,137 +860,127 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    setb %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
@@ -1054,85 +1001,81 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, %ebx
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebx
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
@@ -1143,26 +1086,24 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    addl %esi, %ebx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1176,12 +1117,12 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -1190,15 +1131,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -1208,138 +1149,130 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl 32(%edi), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%edi), %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl 36(%eax), %esi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 40(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl 40(%eax), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 44(%eax), %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%eax), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
@@ -1370,7 +1303,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1378,71 +1311,67 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
@@ -1453,67 +1382,62 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 48(%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl 48(%ebx), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 52(%eax), %ebp
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, %ebx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 52(%eax), %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
@@ -1521,8 +1445,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
@@ -1533,115 +1458,112 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl 56(%esi), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 56(%eax), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl 60(%esi), %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 60(%eax), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
@@ -1667,7 +1589,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1676,9 +1598,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
@@ -1692,90 +1613,85 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -1784,13 +1700,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -1801,25 +1715,26 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -1830,139 +1745,131 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
@@ -1991,7 +1898,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -2017,94 +1924,87 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %edi, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %bl, %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
-; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -2143,119 +2043,110 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -2286,7 +2177,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -2309,84 +2200,75 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl %bl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    adcl %eax, %ebp
@@ -2409,7 +2291,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -2440,81 +2322,75 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 64(%eax), %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl 64(%ebx), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 68(%eax), %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 68(%eax), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl 72(%eax), %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
@@ -2522,163 +2398,153 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl 76(%eax), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
@@ -2689,44 +2555,41 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    adcl %ebp, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 80(%eax), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 80(%edi), %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -2737,11 +2600,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 84(%eax), %ecx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 84(%edi), %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -2760,124 +2621,117 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 88(%eax), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 88(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 92(%eax), %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 92(%eax), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -2887,8 +2741,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -2902,7 +2755,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -2923,90 +2776,83 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    adcl $0, %edx
@@ -3033,8 +2879,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -3044,25 +2889,24 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -3075,73 +2919,69 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ecx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %esi, %eax
@@ -3160,58 +3000,56 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    imull %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    imull %ebp, %eax
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    imull %edi, %esi
+; X86-NEXT:    imull %ecx, %esi
 ; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3220,62 +3058,61 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    imull %esi, %ecx
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebp, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl %ebx, %ebp
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl 104(%ecx), %ebx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3313,6 +3150,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl 100(%esi), %ebp
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -3327,65 +3165,60 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %edi, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
@@ -3413,52 +3246,52 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movl 116(%esi), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    movl 120(%esi), %eax
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    imull %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 124(%ecx), %ecx
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 124(%esi), %esi
+; X86-NEXT:    imull %ecx, %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ebp
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    adcl %ecx, %ebp
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
@@ -3468,14 +3301,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %edi
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -3485,33 +3315,31 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    imull %eax, %ecx
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %bl, %edi
 ; X86-NEXT:    adcl %edi, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -3535,7 +3363,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
@@ -3563,9 +3391,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 92(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 92(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -3575,11 +3402,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3589,13 +3416,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl 80(%ecx), %ebx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl 84(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl 84(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
@@ -3609,81 +3436,75 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edi
@@ -3691,103 +3512,98 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 72(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 72(%ecx), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl 76(%ecx), %esi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl 76(%ecx), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl 64(%esi), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 64(%edi), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl 68(%esi), %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl 68(%edi), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -3797,44 +3613,44 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -3842,64 +3658,62 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ecx
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
@@ -3910,57 +3724,53 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
 ; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    adcl $0, %edx
@@ -3974,13 +3784,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -3988,13 +3798,12 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
@@ -4007,17 +3816,17 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
@@ -4028,89 +3837,84 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4130,14 +3934,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 96(%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl 96(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
@@ -4145,12 +3948,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 100(%eax), %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 100(%ebp), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4163,105 +3964,102 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 104(%eax), %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 104(%eax), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 108(%eax), %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 108(%eax), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edi
@@ -4270,50 +4068,47 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    imull %edi, %esi
+; X86-NEXT:    imull %ebx, %esi
 ; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
@@ -4322,53 +4117,52 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl 120(%ebx), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 120(%edi), %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    movl 124(%ebx), %eax
+; X86-NEXT:    movl 124(%edi), %eax
 ; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movl 112(%ebx), %edi
-; X86-NEXT:    movl 116(%ebx), %ebp
+; X86-NEXT:    movl 112(%edi), %ecx
+; X86-NEXT:    movl 116(%edi), %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    imull %ebp, %ebx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %edi
+; X86-NEXT:    imull %edi, %ebx
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
@@ -4376,13 +4170,13 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -4391,25 +4185,24 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
@@ -4429,73 +4222,68 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ecx, %ebp
 ; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
@@ -4517,45 +4305,41 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    imull %eax, %esi
+; X86-NEXT:    imull %ecx, %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %esi
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    imull %edi, %esi
 ; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebx
@@ -4563,7 +4347,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
@@ -4572,35 +4356,34 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %esi, %ecx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    imull %edi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    imull %ebp, %eax
+; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %esi
@@ -4623,35 +4406,34 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4659,14 +4441,12 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4683,9 +4463,9 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4713,24 +4493,27 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4740,8 +4523,6 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -4783,18 +4564,17 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, 64(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, 68(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 72(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 76(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 80(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, 84(%ecx)
-; X86-NEXT:    movl %ebp, 88(%ecx)
+; X86-NEXT:    movl %edi, 72(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 76(%ecx)
+; X86-NEXT:    movl %ebp, 80(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 84(%ecx)
+; X86-NEXT:    movl %ebx, 88(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, 92(%ecx)
-; X86-NEXT:    movl %ebx, 96(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 96(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, 100(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4803,7 +4583,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, 108(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, 112(%ecx)
-; X86-NEXT:    movl %edi, 116(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, 116(%ecx)
 ; X86-NEXT:    movl %edx, 120(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, 124(%ecx)
@@ -4822,39 +4603,41 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    subq $240, %rsp
+; X64-NEXT:    subq $224, %rsp
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq 40(%rdi), %rbx
 ; X64-NEXT:    movq 32(%rdi), %r12
-; X64-NEXT:    movq 56(%rdi), %r15
+; X64-NEXT:    movq 56(%rdi), %r14
 ; X64-NEXT:    movq 48(%rdi), %r10
 ; X64-NEXT:    movq (%rsi), %r11
-; X64-NEXT:    movq 8(%rsi), %r14
+; X64-NEXT:    movq 8(%rsi), %r8
 ; X64-NEXT:    movq %rsi, %r13
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %rcx, %r9
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %r8, %rcx
+; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    addq %r9, %r8
 ; X64-NEXT:    adcq %rsi, %r10
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %r9d
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r14
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rcx, %r14
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %r10, %rsi
@@ -4904,30 +4687,31 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    addq %rdi, %r12
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq 24(%r13), %rbp
+; X64-NEXT:    movq 24(%r13), %rdi
 ; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    mulq %rbp
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdi, %r10
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    addq %r12, %rax
 ; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    adcq %r9, %r13
-; X64-NEXT:    setb %r10b
+; X64-NEXT:    setb %bpl
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rbp
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %r13, %r9
-; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    movzbl %bpl, %eax
 ; X64-NEXT:    adcq %rax, %rdi
 ; X64-NEXT:    addq %r15, %r14
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbx, %r12
-; X64-NEXT:    movq %r12, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %r9
 ; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    addq %rsi, %r9
 ; X64-NEXT:    adcq %rcx, %rdi
-; X64-NEXT:    setb %r10b
+; X64-NEXT:    setb %r12b
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -4942,15 +4726,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %rcx, %rbx
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rbp
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %rbx, %rax
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    setb %sil
 ; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rbp
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    movzbl %sil, %eax
@@ -4959,7 +4743,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rdi, %rbx
 ; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    movzbl %r12b, %eax
 ; X64-NEXT:    adcq %rax, %rcx
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdx
@@ -4999,8 +4783,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq 8(%r14), %r14
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq 8(%r14), %rax
+; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %r12
@@ -5013,8 +4797,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r15, %rsi
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r14, %r15
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %r8, %r15
+; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rax, %r13
@@ -5031,8 +4815,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %rbx
@@ -5066,7 +4850,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %r8, %rdi
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %rdi
@@ -5083,23 +4867,23 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    adcq %rdi, %r13
 ; X64-NEXT:    setb %dil
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %r8, %r9
+; X64-NEXT:    movq %r8, %rbx
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movzbl %dil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
-; X64-NEXT:    addq %rsi, %rbx
+; X64-NEXT:    addq %rsi, %r9
 ; X64-NEXT:    adcq %r15, %r11
 ; X64-NEXT:    movzbl %r10b, %ecx
 ; X64-NEXT:    adcq %rcx, %rax
 ; X64-NEXT:    adcq $0, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq (%rsp), %rdx # 8-byte Folded Reload
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq $0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
@@ -5111,145 +4895,147 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    addq %rsi, %r11
 ; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq 40(%r8), %rbx
+; X64-NEXT:    movq 40(%r8), %r9
 ; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %r11, %rsi
 ; X64-NEXT:    adcq %rdi, %r15
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    addq %r15, %r11
 ; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %r15, %rbp
 ; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %rbx, %rcx
-; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %r14, %rbx
+; X64-NEXT:    movq %r9, (%rsp) # 8-byte Spill
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    addq %rbp, %rax
-; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r13, %r10
-; X64-NEXT:    setb %bl
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    setb %r14b
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %r10, %rbp
-; X64-NEXT:    movzbl %bl, %eax
+; X64-NEXT:    movzbl %r14b, %eax
 ; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %r12, %rbp
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    adcq %rsi, %r15
 ; X64-NEXT:    adcq $0, %r11
 ; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    movq 48(%r8), %rcx
-; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    movq %rbx, %r14
+; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rbx
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r14, %r12
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    movq %r12, %r9
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %rsi, %r13
 ; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq 56(%r8), %rsi
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq 56(%r8), %r12
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rax, %r9
-; X64-NEXT:    addq %r13, %r9
+; X64-NEXT:    movq %rax, %r8
+; X64-NEXT:    addq %r13, %r8
 ; X64-NEXT:    adcq %r10, %r14
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    setb %r10b
+; X64-NEXT:    movq %r9, %rax
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %r14, %r13
-; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %rsi
 ; X64-NEXT:    addq %rbp, %rbx
-; X64-NEXT:    adcq %r15, %r9
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, %rbx
+; X64-NEXT:    adcq %r15, %rbx
 ; X64-NEXT:    adcq $0, %r13
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    addq %r11, %r13
 ; X64-NEXT:    adcq %rdi, %rsi
-; X64-NEXT:    setb %r11b
+; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %r12
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rcx, %rbp
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rdi, %r14
 ; X64-NEXT:    adcq $0, %r10
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    addq %r14, %rbp
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    addq %r14, %rcx
 ; X64-NEXT:    adcq %r10, %r8
 ; X64-NEXT:    setb %r10b
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    addq %r8, %rdi
 ; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %r13, %r12
-; X64-NEXT:    adcq %rsi, %rbp
-; X64-NEXT:    movzbl %r11b, %eax
+; X64-NEXT:    addq %r13, %r9
+; X64-NEXT:    adcq %rsi, %rcx
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X64-NEXT:    adcq %rax, %rdi
 ; X64-NEXT:    adcq $0, %r15
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    addq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    adcq %rax, (%rsp) # 8-byte Folded Spill
+; X64-NEXT:    adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
 ; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    adcq $0, %rbp
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
 ; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
@@ -5258,7 +5044,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r14
@@ -5267,7 +5053,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %rcx, %r8
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    movq (%rsp), %r11 # 8-byte Reload
 ; X64-NEXT:    mulq %r11
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %rbx
@@ -5309,18 +5095,18 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %r8, %r13
 ; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r12
-; X64-NEXT:    addq %rbp, %r13
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    adcq %rbx, %r12
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    movq %r11, %rbx
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %rbp
 ; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    addq %r8, %r10
@@ -5398,40 +5184,38 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; X64-NEXT:    movq 64(%r13), %r15
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq 64(%r13), %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    addq %rsi, %r8
 ; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    movq 72(%r13), %rsi
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %r8, %rbx
 ; X64-NEXT:    adcq %rdi, %r10
 ; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %r10, %r9
 ; X64-NEXT:    movzbl %r8b, %eax
 ; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r15, %rcx
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
@@ -5441,8 +5225,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %r8, %r14
 ; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    movq %r12, %rcx
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r11, %rcx
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    addq %r14, %rax
@@ -5457,15 +5241,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %r8, %rbp
 ; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %r11, %rbp
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    adcq %rbx, %r15
 ; X64-NEXT:    adcq $0, %r9
 ; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    movq 80(%r13), %r14
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    movq %rax, %rsi
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %r10
@@ -5487,8 +5271,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %r8, %r13
 ; X64-NEXT:    movzbl %r10b, %eax
 ; X64-NEXT:    adcq %rax, %r12
-; X64-NEXT:    addq %rbp, %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
 ; X64-NEXT:    adcq %r15, %r11
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %r13
@@ -5500,36 +5283,34 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %rdi
-; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; X64-NEXT:    movq %r11, %rax
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    addq %rdi, %r10
 ; X64-NEXT:    adcq $0, %r8
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq %r9, %r15
 ; X64-NEXT:    mulq %rbx
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    addq %r10, %rax
 ; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    adcq %r8, %rdi
 ; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %rcx, %r9
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r11, %r9
 ; X64-NEXT:    mulq %rbx
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq %rax, %r15
+; X64-NEXT:    addq %rdi, %r15
 ; X64-NEXT:    movzbl %r8b, %eax
 ; X64-NEXT:    adcq %rax, %rdx
-; X64-NEXT:    addq %r13, %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %r13, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r12, %r10
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movzbl %bpl, %eax
-; X64-NEXT:    adcq %rax, %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rax, %r15
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
@@ -5538,23 +5319,22 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    addq %rbx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    imulq %rcx, %r14
+; X64-NEXT:    movq (%rsp), %r13 # 8-byte Reload
+; X64-NEXT:    imulq %r13, %r14
 ; X64-NEXT:    addq %rdx, %r14
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    movq %rax, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    imulq %rsi, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    addq %r10, %rdx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    imulq %r11, %rbx
+; X64-NEXT:    imulq %rcx, %rbx
 ; X64-NEXT:    addq %rdx, %rbx
 ; X64-NEXT:    addq %r8, %rdi
 ; X64-NEXT:    adcq %r14, %rbx
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %rcx, %rax
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -5564,15 +5344,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %r8, %r14
 ; X64-NEXT:    adcq $0, %r10
-; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %r8
-; X64-NEXT:    addq %r14, %rax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %r14, %r11
 ; X64-NEXT:    adcq %r10, %r8
 ; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %rcx
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %r8, %r14
@@ -5580,23 +5360,23 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    adcq %rax, %rsi
 ; X64-NEXT:    addq %rdi, %r14
 ; X64-NEXT:    adcq %rbx, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq 112(%rcx), %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq 112(%r8), %r10
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    imulq %r11, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    imulq %rcx, %r10
 ; X64-NEXT:    addq %rdx, %r10
-; X64-NEXT:    movq 120(%rcx), %rax
+; X64-NEXT:    movq 120(%r8), %rax
 ; X64-NEXT:    imulq %rdi, %rax
 ; X64-NEXT:    movq %rdi, %r12
 ; X64-NEXT:    addq %rax, %r10
-; X64-NEXT:    movq 96(%rcx), %r13
-; X64-NEXT:    movq 104(%rcx), %r8
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r15, %rbx
+; X64-NEXT:    movq 96(%r8), %r13
+; X64-NEXT:    movq 104(%r8), %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    imulq %r8, %rbx
 ; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rax, %rdi
@@ -5605,11 +5385,10 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %rdx, %r9
 ; X64-NEXT:    addq %rbp, %rdi
 ; X64-NEXT:    adcq %r10, %r9
-; X64-NEXT:    movq %r9, %r15
 ; X64-NEXT:    movq %r13, %rax
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %rbp
@@ -5617,84 +5396,83 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %r10, %r12
 ; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %r12, %r13
 ; X64-NEXT:    adcq %rbp, %r10
-; X64-NEXT:    setb %bl
+; X64-NEXT:    setb %bpl
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    addq %r10, %rax
-; X64-NEXT:    movzbl %bl, %r8d
+; X64-NEXT:    movzbl %bpl, %r8d
 ; X64-NEXT:    adcq %r8, %rdx
 ; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    adcq %r15, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; X64-NEXT:    adcq %r9, %rdx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT:    adcq %r11, %r13
 ; X64-NEXT:    adcq %r14, %rax
 ; X64-NEXT:    adcq %rsi, %rdx
-; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT:    adcq %r15, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    movq 80(%r14), %r10
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    movq 80(%rbx), %r11
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq 88(%r14), %r15
+; X64-NEXT:    movq 88(%rbx), %r15
 ; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %rcx, %r9
 ; X64-NEXT:    adcq $0, %r8
-; X64-NEXT:    movq %r10, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    addq %r9, %rdi
 ; X64-NEXT:    adcq %r8, %rcx
 ; X64-NEXT:    setb %r8b
 ; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    mulq %r13
+; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    addq %rcx, %r12
 ; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    movq 64(%r14), %rcx
+; X64-NEXT:    adcq %rax, %r9
+; X64-NEXT:    movq 64(%rbx), %rcx
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rbx
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %r11
-; X64-NEXT:    movq 72(%r14), %r8
+; X64-NEXT:    movq 72(%rbx), %r8
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %r11, %r14
 ; X64-NEXT:    adcq $0, %rbx
 ; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    movq %rcx, %r9
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %rcx, %r10
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbx, %r11
 ; X64-NEXT:    setb %cl
 ; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r10
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %r11, %rbp
@@ -5703,79 +5481,78 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %rsi, %rbp
 ; X64-NEXT:    adcq %rdi, %rbx
 ; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq %r9, %rcx
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    adcq $0, %r9
+; X64-NEXT:    movq %r10, %r13
+; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rsi
-; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    movq %r8, %r10
 ; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    addq %rsi, %r11
 ; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %r13, %rax
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    addq %r11, %rax
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    adcq %rdi, %r13
-; X64-NEXT:    setb %cl
+; X64-NEXT:    setb %r14b
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    addq %r13, %rdi
-; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    movzbl %r14b, %eax
 ; X64-NEXT:    adcq %rax, %rsi
-; X64-NEXT:    addq %rbp, %r9
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %rbp, %rcx
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbx, %r11
 ; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    adcq $0, %rsi
 ; X64-NEXT:    addq %r12, %rdi
-; X64-NEXT:    adcq %r15, %rsi
+; X64-NEXT:    adcq %r9, %rsi
 ; X64-NEXT:    setb %cl
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %r9
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %r9, %rbx
 ; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    movq %r8, %r9
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    addq %rbx, %rax
-; X64-NEXT:    movq %rax, %r10
+; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    adcq %r11, %r13
 ; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %r9
-; X64-NEXT:    addq %r13, %rax
-; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    addq %r13, %rbp
 ; X64-NEXT:    movzbl %r8b, %eax
 ; X64-NEXT:    adcq %rax, %rdx
-; X64-NEXT:    addq %rdi, %r15
-; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %rsi, %r10
+; X64-NEXT:    addq %rdi, %r10
 ; X64-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rsi, %rbx
+; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    adcq %rax, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq %rax, %rbp
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
@@ -5793,36 +5570,36 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq 112(%rcx), %rax
 ; X64-NEXT:    movq %rcx, %r14
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; X64-NEXT:    imulq %r10, %rcx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    mulq %rbx
+; X64-NEXT:    imulq %rbx, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    addq %rcx, %rdx
 ; X64-NEXT:    movq 120(%r14), %r13
-; X64-NEXT:    imulq %rbx, %r13
+; X64-NEXT:    imulq %r10, %r13
 ; X64-NEXT:    addq %rdx, %r13
 ; X64-NEXT:    addq %rdi, %r8
 ; X64-NEXT:    adcq %r11, %r13
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %rbx, %rcx
+; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    movq %rbx, %r14
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %r11
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rdi, %rbx
 ; X64-NEXT:    adcq $0, %r11
-; X64-NEXT:    movq %rcx, %rax
+; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r12
 ; X64-NEXT:    addq %rbx, %r12
 ; X64-NEXT:    adcq %r11, %rcx
 ; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r10, %rax
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %r9
@@ -5840,56 +5617,55 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    addq %rdi, %rdx
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    imulq %r14, %rax
+; X64-NEXT:    imulq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
 ; X64-NEXT:    addq %rdx, %rax
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; X64-NEXT:    movq %rax, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    imulq %r8, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT:    imulq %r14, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rax, %r11
 ; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    imulq %rdi, %rbp
-; X64-NEXT:    addq %rdx, %rbp
+; X64-NEXT:    imulq %rdi, %r15
+; X64-NEXT:    addq %rdx, %r15
 ; X64-NEXT:    addq %rcx, %r11
-; X64-NEXT:    adcq %r13, %rbp
+; X64-NEXT:    adcq %r13, %r15
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %r8, %r15
+; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    addq %rcx, %rsi
 ; X64-NEXT:    adcq $0, %r8
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %rdi
 ; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    addq %rsi, %rcx
 ; X64-NEXT:    adcq %r8, %rdi
 ; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %r15, %rax
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    addq %rdi, %rax
 ; X64-NEXT:    movzbl %sil, %esi
 ; X64-NEXT:    adcq %rsi, %rdx
 ; X64-NEXT:    addq %r11, %rax
-; X64-NEXT:    adcq %rbp, %rdx
+; X64-NEXT:    adcq %r15, %rdx
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    adcq %r12, %rcx
 ; X64-NEXT:    adcq %r9, %rax
 ; X64-NEXT:    adcq %rbx, %rdx
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT:    adcq %rbp, %rax
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq (%rsp), %rsi # 8-byte Reload
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
@@ -5922,7 +5698,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rdi, 24(%rsi)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    movq %rdi, 32(%rsi)
-; X64-NEXT:    movq (%rsp), %rdi # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    movq %rdi, 40(%rsi)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    movq %rdi, 48(%rsi)
@@ -5936,7 +5712,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rcx, 104(%rsi)
 ; X64-NEXT:    movq %rax, 112(%rsi)
 ; X64-NEXT:    movq %rdx, 120(%rsi)
-; X64-NEXT:    addq $240, %rsp
+; X64-NEXT:    addq $224, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll
index 2d7737bfdd3c2e..54020422bf761e 100644
--- a/llvm/test/CodeGen/X86/mul-i256.ll
+++ b/llvm/test/CodeGen/X86/mul-i256.ll
@@ -22,7 +22,7 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %edi
@@ -58,6 +58,7 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl 4(%esi), %ebp
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -72,86 +73,79 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, %esi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl 8(%eax), %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 12(%eax), %ebp
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-NEXT:    movl 12(%eax), %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
@@ -165,7 +159,7 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    movl 20(%ecx), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -174,8 +168,7 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    movl 24(%ecx), %eax
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    imull %ebp, %ecx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %ecx, %edx
@@ -183,20 +176,21 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl 28(%ecx), %ecx
 ; X86-NEXT:    imull %ebx, %ecx
 ; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
@@ -209,28 +203,29 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %bl, %esi
 ; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl 24(%edi), %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    movl 28(%edi), %eax
-; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    imull %ebx, %eax
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    movl 16(%edi), %edi
-; X86-NEXT:    movl 20(%edx), %ebp
+; X86-NEXT:    movl 20(%edx), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    imull %ebp, %ebx
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    imull %ecx, %ebx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -241,35 +236,32 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %esi, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %ebp, %edi
 ; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index 2421aabdbcd994..47f01a07c19198 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -9,20 +9,20 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $180, %esp
+; X86-NEXT:    subl $184, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 28(%eax), %ebx
-; X86-NEXT:    movl 24(%eax), %ebp
+; X86-NEXT:    movl 28(%eax), %ebp
+; X86-NEXT:    movl 24(%eax), %ebx
 ; X86-NEXT:    movl (%edx), %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -30,25 +30,25 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl 4(%eax), %edi
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 16(%ecx), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 16(%ecx), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -60,95 +60,88 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %bl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 8(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl 8(%eax), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 12(%eax), %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%eax), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %ecx, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
@@ -156,40 +149,40 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl 8(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 8(%ecx), %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 12(%ecx), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 12(%ecx), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl (%esi), %ebx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esi), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -201,58 +194,55 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %esi
 ; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %esi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -262,73 +252,71 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebp, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    adcl %ebx, %eax
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 16(%eax), %ecx
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl 16(%ebx), %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edi, %ebx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 20(%eax), %esi
+; X86-NEXT:    movl 20(%ebx), %esi
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -341,122 +329,117 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 24(%eax), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl 24(%eax), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 28(%eax), %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%eax), %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %edi, %ebp
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
@@ -484,7 +467,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -502,91 +485,84 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %cl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, %esi
-; X86-NEXT:    mull %ebx
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl %edi, %esi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    adcl %ecx, %esi
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull (%esp) # 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    adcl %ebx, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -606,14 +582,13 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 32(%eax), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl 32(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ecx
@@ -621,12 +596,10 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl 36(%eax), %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 36(%ebp), %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -661,84 +634,79 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, %esi
 ; X86-NEXT:    mull %edi
-; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl 40(%edi), %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 40(%eax), %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl 44(%edi), %edi
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 44(%eax), %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %edi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
@@ -746,37 +714,37 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    imull %edi, %esi
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X86-NEXT:    imull %ecx, %esi
 ; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    adcl (%esp), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
@@ -795,56 +763,57 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl 56(%edi), %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    movl 60(%edi), %eax
-; X86-NEXT:    imull %ebp, %eax
+; X86-NEXT:    imull %esi, %eax
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    movl 48(%edi), %esi
-; X86-NEXT:    movl 52(%edi), %edi
+; X86-NEXT:    movl 48(%edx), %esi
+; X86-NEXT:    movl 52(%edx), %edx
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    imull %edi, %ebx
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    imull %edx, %ebx
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    imull %esi, %ebx
 ; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %ebp, %ecx
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    addl %ebx, %ebp
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
@@ -852,13 +821,13 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -871,9 +840,8 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl 44(%ecx), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ecx), %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
@@ -883,11 +851,11 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -897,13 +865,13 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl 32(%esi), %ebx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl 36(%esi), %ebp
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl 36(%esi), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
@@ -916,74 +884,69 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebx, %esi
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ecx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %esi, %eax
@@ -1011,14 +974,13 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    addl %edx, %ebx
 ; X86-NEXT:    movl 56(%esi), %eax
+; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl 60(%esi), %esi
+; X86-NEXT:    movl 60(%ebp), %esi
 ; X86-NEXT:    imull %ecx, %esi
 ; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -1028,7 +990,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %ebx
@@ -1052,41 +1014,42 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    imull %esi, %ecx
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    imull %ebp, %edi
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    imull %ebx, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    imull %ebx, %eax
+; X86-NEXT:    addl %edx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    imull %ebp, %ecx
-; X86-NEXT:    addl %edx, %ecx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ecx, %esi
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
@@ -1099,7 +1062,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movzbl %bl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -1129,9 +1092,10 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -1157,14 +1121,14 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X86-NEXT:    movl %esi, 32(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, 36(%ecx)
+; X86-NEXT:    movl %ebx, 40(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, 40(%ecx)
-; X86-NEXT:    movl %ebx, 44(%ecx)
+; X86-NEXT:    movl %esi, 44(%ecx)
 ; X86-NEXT:    movl %ebp, 48(%ecx)
 ; X86-NEXT:    movl %edi, 52(%ecx)
 ; X86-NEXT:    movl %eax, 56(%ecx)
 ; X86-NEXT:    movl %edx, 60(%ecx)
-; X86-NEXT:    addl $180, %esp
+; X86-NEXT:    addl $184, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1179,174 +1143,163 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movq %rdx, (%rsp) # 8-byte Spill
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq (%rdi), %rbx
-; X64-NEXT:    movq 8(%rdi), %rdi
-; X64-NEXT:    movq 24(%rax), %r14
-; X64-NEXT:    movq 16(%rax), %rax
-; X64-NEXT:    movq (%rsi), %r8
-; X64-NEXT:    movq 8(%rsi), %r11
-; X64-NEXT:    movq %rsi, %r13
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdi, %r11
+; X64-NEXT:    movq (%rdi), %r14
+; X64-NEXT:    movq 8(%rdi), %rbp
+; X64-NEXT:    movq 24(%rdi), %r8
+; X64-NEXT:    movq 16(%rdi), %rax
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq (%rsi), %r13
+; X64-NEXT:    movq 8(%rsi), %rdi
 ; X64-NEXT:    movq %rax, %rsi
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %rbp
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    addq %rcx, %r10
 ; X64-NEXT:    adcq $0, %r9
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    addq %r10, %r15
 ; X64-NEXT:    adcq %r9, %rcx
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %esi
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdx, %r8
 ; X64-NEXT:    movq %rax, %r9
 ; X64-NEXT:    addq %rcx, %r9
-; X64-NEXT:    adcq %rsi, %rdx
-; X64-NEXT:    movq %rdx, %r12
-; X64-NEXT:    movq %rbx, %rsi
-; X64-NEXT:    movq %rbx, %rax
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    adcq %rsi, %r8
+; X64-NEXT:    movq %r14, %rsi
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r8
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rcx, %r14
 ; X64-NEXT:    adcq $0, %rbx
 ; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    movq %rsi, %r8
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %r14, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbx, %rcx
-; X64-NEXT:    setb %sil
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    setb %r12b
+; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rcx, %rbx
-; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movzbl %r12b, %eax
 ; X64-NEXT:    adcq %rax, %r14
-; X64-NEXT:    addq %rbp, %rbx
+; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
 ; X64-NEXT:    adcq %r15, %r14
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    adcq $0, %r12
-; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r13, %rsi
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq 16(%r13), %r10
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq 16(%rdi), %r10
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rsi, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r13
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movq %rdi, %r12
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rcx, %rbp
 ; X64-NEXT:    adcq $0, %r15
-; X64-NEXT:    movq 24(%rsi), %rsi
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq 24(%rdi), %rdi
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    addq %rbp, %r11
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    addq %rbp, %r12
 ; X64-NEXT:    adcq %r15, %rcx
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    setb %sil
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %r15
 ; X64-NEXT:    movq %rax, %rbp
 ; X64-NEXT:    addq %rcx, %rbp
-; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    addq %rbx, %r13
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    adcq %r14, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; X64-NEXT:    adcq %r14, %r12
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq $0, %rbp
 ; X64-NEXT:    adcq $0, %r15
 ; X64-NEXT:    addq %r9, %rbp
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; X64-NEXT:    setb %dil
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %r10
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    adcq %r8, %r15
+; X64-NEXT:    setb %sil
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
 ; X64-NEXT:    movq %r14, %rax
 ; X64-NEXT:    mulq %r10
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %r12
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rcx, %rbx
 ; X64-NEXT:    adcq $0, %r9
-; X64-NEXT:    movq %r8, %rax
-; X64-NEXT:    mulq %rsi
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %rdi
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    addq %rbx, %rax
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    adcq %r9, %rcx
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %rsi
-; X64-NEXT:    addq %rcx, %rax
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    setb %r9b
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %rdi
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rcx, %r14
+; X64-NEXT:    movzbl %r9b, %eax
 ; X64-NEXT:    adcq %rax, %rdx
-; X64-NEXT:    addq %rbp, %r11
-; X64-NEXT:    movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    addq %rbp, %r12
+; X64-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %r15, %rbx
 ; X64-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    adcq %rax, %rcx
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    adcq %rax, %r14
 ; X64-NEXT:    adcq $0, %rdx
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT:    movq 32(%r8), %r15
-; X64-NEXT:    imulq %r15, %rsi
+; X64-NEXT:    movq 32(%r11), %r15
+; X64-NEXT:    imulq %r15, %rdi
 ; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %r10
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    addq %rsi, %rdx
-; X64-NEXT:    movq 40(%r8), %rsi
+; X64-NEXT:    addq %rdi, %rdx
+; X64-NEXT:    movq 40(%r11), %rsi
 ; X64-NEXT:    imulq %rsi, %r10
 ; X64-NEXT:    addq %rdx, %r10
-; X64-NEXT:    movq 48(%r8), %rax
+; X64-NEXT:    movq 48(%r11), %rax
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
 ; X64-NEXT:    imulq %r9, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT:    mulq %r11
+; X64-NEXT:    mulq %r13
 ; X64-NEXT:    movq %rax, %rbx
 ; X64-NEXT:    addq %rdi, %rdx
-; X64-NEXT:    movq 56(%r8), %r8
-; X64-NEXT:    imulq %r11, %r8
+; X64-NEXT:    movq 56(%r11), %r8
+; X64-NEXT:    imulq %r13, %r8
 ; X64-NEXT:    addq %rdx, %r8
 ; X64-NEXT:    addq %rcx, %rbx
 ; X64-NEXT:    adcq %r10, %r8
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r13, %rax
 ; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1356,7 +1309,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    addq %rcx, %r15
 ; X64-NEXT:    adcq $0, %rdi
-; X64-NEXT:    movq %r11, %rax
+; X64-NEXT:    movq %r13, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r13
@@ -1378,8 +1331,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT:    imulq %r14, %rsi
+; X64-NEXT:    imulq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    addq %rdx, %rsi
 ; X64-NEXT:    movq %r8, %rdx
 ; X64-NEXT:    movq 56(%r8), %rax
@@ -1410,14 +1362,15 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    addq %rcx, %rdi
 ; X64-NEXT:    adcq $0, %r15
 ; X64-NEXT:    movq %rbp, %rax
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    addq %rdi, %r8
 ; X64-NEXT:    adcq %r15, %rcx
 ; X64-NEXT:    setb %dil
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    mulq %rbp
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    movzbl %dil, %ecx
 ; X64-NEXT:    adcq %rcx, %rdx
@@ -1429,9 +1382,9 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    adcq %r12, %rdx
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; X64-NEXT:    adcq %r14, %rax
 ; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; X64-NEXT:    movq (%rsp), %rcx # 8-byte Reload
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
 ; X64-NEXT:    movq %rdi, (%rcx)
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
@@ -1444,7 +1397,6 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
 ; X64-NEXT:    movq %r8, 40(%rcx)
 ; X64-NEXT:    movq %rax, 48(%rcx)
 ; X64-NEXT:    movq %rdx, 56(%rcx)
-; X64-NEXT:    addq $8, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
index beb42f55b709cc..fcfede89968318 100644
--- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -93,8 +93,8 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:  ## %bb.10: ## %do.end
 ; CHECK-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-NEXT:    xorl %ebp, %ebp
-; CHECK-NEXT:    testb %bpl, %bpl
+; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    testb %bl, %bl
 ; CHECK-NEXT:    jne LBB0_11
 ; CHECK-NEXT:  ## %bb.12: ## %while.body200.preheader
 ; CHECK-NEXT:    xorl %r12d, %r12d
@@ -113,13 +113,13 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    decl %r13d
 ; CHECK-NEXT:    testl %r13d, %r13d
-; CHECK-NEXT:    movl %ebp, %r15d
+; CHECK-NEXT:    movl %ebx, %r15d
 ; CHECK-NEXT:    jle LBB0_21
 ; CHECK-NEXT:  LBB0_13: ## %while.body200
 ; CHECK-NEXT:    ## =>This Loop Header: Depth=1
 ; CHECK-NEXT:    ## Child Loop BB0_28 Depth 2
 ; CHECK-NEXT:    ## Child Loop BB0_37 Depth 2
-; CHECK-NEXT:    leal -268(%rbp), %eax
+; CHECK-NEXT:    leal -268(%rbx), %eax
 ; CHECK-NEXT:    cmpl $105, %eax
 ; CHECK-NEXT:    ja LBB0_14
 ; CHECK-NEXT:  ## %bb.55: ## %while.body200
@@ -130,12 +130,12 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:  LBB0_25: ## %sw.bb474
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    testb %r12b, %r12b
-; CHECK-NEXT:    ## implicit-def: $rbx
+; CHECK-NEXT:    ## implicit-def: $rbp
 ; CHECK-NEXT:    jne LBB0_33
 ; CHECK-NEXT:  ## %bb.26: ## %do.body479.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    testb %r12b, %r12b
-; CHECK-NEXT:    ## implicit-def: $rbx
+; CHECK-NEXT:    ## implicit-def: $rbp
 ; CHECK-NEXT:    jne LBB0_33
 ; CHECK-NEXT:  ## %bb.27: ## %land.rhs485.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
@@ -144,7 +144,7 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_31: ## %do.body479.backedge
 ; CHECK-NEXT:    ## in Loop: Header=BB0_28 Depth=2
-; CHECK-NEXT:    leaq 1(%rbx), %rax
+; CHECK-NEXT:    leaq 1(%rbp), %rax
 ; CHECK-NEXT:    testb %r12b, %r12b
 ; CHECK-NEXT:    je LBB0_32
 ; CHECK-NEXT:  LBB0_28: ## %land.rhs485
@@ -154,7 +154,7 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:    jne LBB0_54
 ; CHECK-NEXT:  ## %bb.29: ## %cond.true.i.i2780
 ; CHECK-NEXT:    ## in Loop: Header=BB0_28 Depth=2
-; CHECK-NEXT:    movq %rax, %rbx
+; CHECK-NEXT:    movq %rax, %rbp
 ; CHECK-NEXT:    testb %r12b, %r12b
 ; CHECK-NEXT:    jne LBB0_31
 ; CHECK-NEXT:  ## %bb.30: ## %lor.rhs500
@@ -168,7 +168,7 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  LBB0_14: ## %while.body200
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    leal 1(%rbp), %eax
+; CHECK-NEXT:    leal 1(%rbx), %eax
 ; CHECK-NEXT:    cmpl $21, %eax
 ; CHECK-NEXT:    ja LBB0_20
 ; CHECK-NEXT:  ## %bb.15: ## %while.body200
@@ -178,7 +178,7 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:    jmpq *%rax
 ; CHECK-NEXT:  LBB0_18: ## %while.cond201.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    movl $1, %ebp
+; CHECK-NEXT:    movl $1, %ebx
 ; CHECK-NEXT:    jmp LBB0_20
 ; CHECK-NEXT:  LBB0_44: ## %sw.bb1134
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
@@ -188,19 +188,19 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:    jb LBB0_54
 ; CHECK-NEXT:  ## %bb.45: ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
-; CHECK-NEXT:    movl $268, %ebp ## imm = 0x10C
+; CHECK-NEXT:    movl $268, %ebx ## imm = 0x10C
 ; CHECK-NEXT:    jmp LBB0_20
 ; CHECK-NEXT:  LBB0_39: ## %sw.bb566
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    movl $20, %ebp
+; CHECK-NEXT:    movl $20, %ebx
 ; CHECK-NEXT:    jmp LBB0_20
 ; CHECK-NEXT:  LBB0_19: ## %sw.bb243
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    movl $2, %ebp
+; CHECK-NEXT:    movl $2, %ebx
 ; CHECK-NEXT:    jmp LBB0_20
 ; CHECK-NEXT:  LBB0_32: ## %if.end517.loopexitsplit
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    incq %rbx
+; CHECK-NEXT:    incq %rbp
 ; CHECK-NEXT:  LBB0_33: ## %if.end517
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    leal -324(%r15), %eax
@@ -232,7 +232,7 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:  ## %bb.38: ## %for.cond542.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    testb %r12b, %r12b
-; CHECK-NEXT:    movb $0, (%rbx)
+; CHECK-NEXT:    movb $0, (%rbp)
 ; CHECK-NEXT:    leaq LJTI0_0(%rip), %rdx
 ; CHECK-NEXT:    jmp LBB0_20
 ; CHECK-NEXT:    .p2align 4
@@ -250,12 +250,12 @@ define ptr @SyFgets(ptr %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:  LBB0_11:
 ; CHECK-NEXT:    movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT:  LBB0_21: ## %while.end1465
-; CHECK-NEXT:    incl %ebp
-; CHECK-NEXT:    cmpl $16, %ebp
+; CHECK-NEXT:    incl %ebx
+; CHECK-NEXT:    cmpl $16, %ebx
 ; CHECK-NEXT:    ja LBB0_49
 ; CHECK-NEXT:  ## %bb.22: ## %while.end1465
 ; CHECK-NEXT:    movl $83969, %eax ## imm = 0x14801
-; CHECK-NEXT:    btl %ebp, %eax
+; CHECK-NEXT:    btl %ebx, %eax
 ; CHECK-NEXT:    jae LBB0_49
 ; CHECK-NEXT:  ## %bb.23:
 ; CHECK-NEXT:    xorl %ebx, %ebx
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 874913629e9e3f..3abc40202e7b63 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -369,16 +369,17 @@ define i8 @scmp_wide_op(i109 %x, i109 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    shll $19, %eax
 ; X86-NEXT:    sarl $19, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll $19, %ecx
 ; X86-NEXT:    sarl $19, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    sbbl %edx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -386,14 +387,14 @@ define i8 @scmp_wide_op(i109 %x, i109 %y) nounwind {
 ; X86-NEXT:    sbbl %ebp, %esi
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    sbbl %eax, %esi
-; X86-NEXT:    setl %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    setl {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    cmpl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    sbbl %edi, %ebp
 ; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    setl %al
-; X86-NEXT:    subb %bl, %al
+; X86-NEXT:    subb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; X86-NEXT:    addl $4, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -858,7 +859,7 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
@@ -904,73 +905,72 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    setl %al
-; X86-NEXT:    setg %bh
-; X86-NEXT:    subb %al, %bh
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    setl %al
-; X86-NEXT:    setg %bl
-; X86-NEXT:    subb %al, %bl
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    setl %al
-; X86-NEXT:    setg %dh
-; X86-NEXT:    subb %al, %dh
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    setl %al
-; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movsbl %dl, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movsbl %cl, %ebx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    setl %al
-; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movsbl %dl, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movsbl %cl, %edi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    setl %al
-; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movsbl %dl, %ebp
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movsbl %cl, %ebp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    setl %al
-; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movsbl %dl, %edi
+; X86-NEXT:    setg %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movsbl %cl, %esi
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    setg %ah
 ; X86-NEXT:    subb %al, %ah
-; X86-NEXT:    movsbl %ah, %esi
+; X86-NEXT:    movsbl %ah, %edx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    setl %al
-; X86-NEXT:    setg %dl
-; X86-NEXT:    subb %al, %dl
-; X86-NEXT:    movsbl %dl, %ecx
+; X86-NEXT:    setg %ah
+; X86-NEXT:    subb %al, %ah
+; X86-NEXT:    movsbl %ah, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %ecx, 60(%eax)
-; X86-NEXT:    movl %esi, 56(%eax)
-; X86-NEXT:    movl %edi, 52(%eax)
+; X86-NEXT:    movl %edx, 56(%eax)
+; X86-NEXT:    movl %esi, 52(%eax)
 ; X86-NEXT:    movl %ebp, 48(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 44(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 40(%eax)
+; X86-NEXT:    movl %edi, 44(%eax)
+; X86-NEXT:    movl %ebx, 40(%eax)
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movsbl %dh, %edx
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
 ; X86-NEXT:    movl %edx, 36(%eax)
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    movsbl %bl, %esi
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X86-NEXT:    movl %esi, 32(%eax)
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    movsbl %bh, %edi
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    movl %edi, 28(%eax)
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
@@ -983,7 +983,7 @@ define <16 x i32> @scmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1641,32 +1641,32 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind {
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    subb %bl, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %ebp, %ecx
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    setl %al
-; X86-NEXT:    cmpl %ecx, %ebp
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    setl %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    setl %al
+; X86-NEXT:    subb %bl, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %ebp, %ecx
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    setl %al
-; X86-NEXT:    cmpl %ecx, %ebp
+; X86-NEXT:    setl %bl
+; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    setl %cl
-; X86-NEXT:    subb %al, %cl
-; X86-NEXT:    movb %cl, (%esp) # 1-byte Spill
+; X86-NEXT:    setl %al
+; X86-NEXT:    subb %bl, %al
+; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    cmpl %eax, %ecx
@@ -1677,49 +1677,49 @@ define <16 x i8> @scmp_wide_vec_op(<16 x i64> %x, <16 x i64> %y) nounwind {
 ; X86-NEXT:    setl %dl
 ; X86-NEXT:    cmpl %ecx, %eax
 ; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    setl %ch
-; X86-NEXT:    subb %dl, %ch
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    setl %bh
+; X86-NEXT:    subb %dl, %bh
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %edx, %esi
+; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    setl %cl
-; X86-NEXT:    cmpl %esi, %edx
-; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    setl %dl
-; X86-NEXT:    subb %cl, %dl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    setl %dh
+; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    sbbl %ecx, %edi
+; X86-NEXT:    setl %bl
+; X86-NEXT:    subb %dh, %bl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %ebx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl %edi, %ebp
+; X86-NEXT:    sbbl %esi, %ebp
 ; X86-NEXT:    setl %dh
-; X86-NEXT:    cmpl %esi, %ebx
-; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    cmpl %ecx, %eax
+; X86-NEXT:    sbbl %edi, %esi
 ; X86-NEXT:    setl %cl
 ; X86-NEXT:    subb %dh, %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %eax, %esi
+; X86-NEXT:    cmpl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    setl %dh
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    setl %bl
-; X86-NEXT:    subb %dh, %bl
+; X86-NEXT:    setl %ch
+; X86-NEXT:    cmpl %esi, %edx
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    setl %dl
+; X86-NEXT:    subb %ch, %dl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movb %bl, 15(%eax)
+; X86-NEXT:    movb %dl, 15(%eax)
 ; X86-NEXT:    movb %cl, 14(%eax)
-; X86-NEXT:    movb %dl, 13(%eax)
-; X86-NEXT:    movb %ch, 12(%eax)
+; X86-NEXT:    movb %bl, 13(%eax)
+; X86-NEXT:    movb %bh, 12(%eax)
 ; X86-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movb %cl, 11(%eax)
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -2221,7 +2221,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $52, %esp
+; X86-NEXT:    subl $48, %esp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    addb %al, %al
 ; X86-NEXT:    sarb %al
@@ -2329,46 +2329,45 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; X86-NEXT:    setl %dl
 ; X86-NEXT:    setg %dh
 ; X86-NEXT:    subb %dl, %dh
-; X86-NEXT:    movsbl %dh, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %ebx, 96(%edi)
-; X86-NEXT:    movl %ebx, 92(%edi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 80(%edi)
+; X86-NEXT:    movsbl %dh, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, 96(%edi)
+; X86-NEXT:    movl %edx, 92(%edi)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, 80(%edi)
 ; X86-NEXT:    movl %eax, 68(%edi)
 ; X86-NEXT:    movl %eax, 64(%edi)
 ; X86-NEXT:    movl %esi, 52(%edi)
 ; X86-NEXT:    movl %esi, 48(%edi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 36(%edi)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, 36(%edi)
 ; X86-NEXT:    movl %ebp, 24(%edi)
 ; X86-NEXT:    movl %ebp, 20(%edi)
 ; X86-NEXT:    movl %ecx, 8(%edi)
 ; X86-NEXT:    movl %ecx, 4(%edi)
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movw %cx, 100(%edi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $30, %edx, %ecx
+; X86-NEXT:    movw %dx, 100(%edi)
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $30, %ebx, %ecx
 ; X86-NEXT:    movl %ecx, 88(%edi)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    shldl $9, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $9, %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $9, %ebx, %ecx
 ; X86-NEXT:    movl %ecx, 76(%edi)
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $20, %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $20, %ebx, %ecx
 ; X86-NEXT:    movl %ecx, 60(%edi)
 ; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $31, %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $31, %ebx, %ecx
 ; X86-NEXT:    movl %ecx, 44(%edi)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    shldl $10, %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    shldl $10, %edx, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    shldl $10, %ebx, %ecx
 ; X86-NEXT:    movl %ecx, 32(%edi)
 ; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -2384,17 +2383,17 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; X86-NEXT:    andl $1048575, %esi # imm = 0xFFFFF
 ; X86-NEXT:    orl %eax, %esi
 ; X86-NEXT:    movl %esi, 56(%edi)
-; X86-NEXT:    shll $10, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    shll $10, %eax
 ; X86-NEXT:    andl $1023, %ebp # imm = 0x3FF
-; X86-NEXT:    orl %edx, %ebp
+; X86-NEXT:    orl %eax, %ebp
 ; X86-NEXT:    movl %ebp, 28(%edi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    shll $21, %eax
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, 12(%edi)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    andl $7, %eax
-; X86-NEXT:    movb %al, 102(%edi)
+; X86-NEXT:    andl $7, %edx
+; X86-NEXT:    movb %dl, 102(%edi)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    shll $30, %eax
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -2404,7 +2403,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, 40(%edi)
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    addl $52, %esp
+; X86-NEXT:    addl $48, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
index 4925f8bc6c8b06..36ebcf8a681cd5 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -273,13 +273,14 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X64-NEXT:    movq %rbx, %rdx
 ; X64-NEXT:    movq %r12, %rcx
 ; X64-NEXT:    callq __divti3 at PLT
-; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    leaq -1(%rax), %rbp
 ; X64-NEXT:    testq %r15, %r15
 ; X64-NEXT:    sets %al
 ; X64-NEXT:    testq %r12, %r12
-; X64-NEXT:    sets %r13b
-; X64-NEXT:    xorb %al, %r13b
+; X64-NEXT:    sets %cl
+; X64-NEXT:    xorb %al, %cl
+; X64-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; X64-NEXT:    movq %r14, %rdi
 ; X64-NEXT:    movq %r15, %rsi
 ; X64-NEXT:    movq %rbx, %rdx
@@ -287,8 +288,8 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X64-NEXT:    callq __modti3 at PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    testb %r13b, %al
-; X64-NEXT:    cmoveq (%rsp), %rbp # 8-byte Folded Reload
+; X64-NEXT:    testb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Reload
+; X64-NEXT:    cmoveq %r13, %rbp
 ; X64-NEXT:    movq %rbp, %rax
 ; X64-NEXT:    addq $8, %rsp
 ; X64-NEXT:    popq %rbx
@@ -573,7 +574,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    calll __moddi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %edi
@@ -624,7 +625,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    calll __moddi3
 ; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
@@ -638,12 +639,11 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    sets %dl
 ; X86-NEXT:    xorb %cl, %dl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    orl (%esp), %ecx # 4-byte Folded Reload
+; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    testb %dl, %cl
-; X86-NEXT:    leal -1(%eax), %ecx
-; X86-NEXT:    cmovel %eax, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    leal -1(%eax), %esi
+; X86-NEXT:    cmovel %eax, %esi
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    sets %al
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
@@ -654,30 +654,30 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    testb %cl, %al
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    leal -1(%eax), %ecx
-; X86-NEXT:    cmovel %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    leal -1(%eax), %edi
+; X86-NEXT:    cmovel %eax, %edi
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    sets %al
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    sets %cl
 ; X86-NEXT:    xorb %al, %cl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    testb %cl, %al
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    leal -1(%eax), %ebp
-; X86-NEXT:    cmovel %eax, %ebp
+; X86-NEXT:    leal -1(%eax), %ebx
+; X86-NEXT:    cmovel %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    sets %al
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    sets %bl
-; X86-NEXT:    xorb %al, %bl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    leal -1(%edi), %esi
+; X86-NEXT:    sets %ah
+; X86-NEXT:    xorb %al, %ah
+; X86-NEXT:    movb %ah, (%esp) # 1-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    leal -1(%eax), %ebp
 ; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    pushl {{[0-9]+}}(%esp)
 ; X86-NEXT:    pushl %edx
@@ -686,15 +686,13 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    setne %al
-; X86-NEXT:    testb %bl, %al
-; X86-NEXT:    cmovel %edi, %esi
+; X86-NEXT:    testb %al, (%esp) # 1-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    movl %ebp, 8(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebp, 12(%eax)
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    addl $60, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index e7727a0ab6178c..ddb665202089c1 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -307,8 +307,7 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    subq $24, %rsp
-; X64-NEXT:    movq %rsi, %rdx
-; X64-NEXT:    movq %rsi, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq %rsi, %rbx
 ; X64-NEXT:    movq %rdi, %r14
 ; X64-NEXT:    leaq (%rdi,%rdi), %rax
 ; X64-NEXT:    movq %rdi, %r15
@@ -319,6 +318,7 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X64-NEXT:    sarq $63, %r12
 ; X64-NEXT:    movq %r14, %rdi
 ; X64-NEXT:    movq %r15, %rsi
+; X64-NEXT:    movq %rbx, %rdx
 ; X64-NEXT:    movq %r12, %rcx
 ; X64-NEXT:    callq __divti3 at PLT
 ; X64-NEXT:    movq %rax, %r13
@@ -330,16 +330,17 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X64-NEXT:    testq %r15, %r15
 ; X64-NEXT:    sets %al
 ; X64-NEXT:    testq %r12, %r12
-; X64-NEXT:    sets %bl
-; X64-NEXT:    xorb %al, %bl
+; X64-NEXT:    sets %cl
+; X64-NEXT:    xorb %al, %cl
+; X64-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; X64-NEXT:    movq %r14, %rdi
 ; X64-NEXT:    movq %r15, %rsi
-; X64-NEXT:    movq (%rsp), %rdx # 8-byte Reload
+; X64-NEXT:    movq %rbx, %rdx
 ; X64-NEXT:    movq %r12, %rcx
 ; X64-NEXT:    callq __modti3 at PLT
 ; X64-NEXT:    orq %rax, %rdx
 ; X64-NEXT:    setne %al
-; X64-NEXT:    testb %bl, %al
+; X64-NEXT:    testb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Reload
 ; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
 ; X64-NEXT:    cmoveq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
 ; X64-NEXT:    movq %rbp, %rcx
@@ -805,7 +806,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $208, %esp
+; X86-NEXT:    subl $192, %esp
 ; X86-NEXT:    movl 36(%ebp), %esi
 ; X86-NEXT:    movl 16(%ebp), %ebx
 ; X86-NEXT:    movl 32(%ebp), %eax
@@ -837,7 +838,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    leal (%ecx,%ecx), %edx
 ; X86-NEXT:    shrl $31, %ecx
 ; X86-NEXT:    shldl $31, %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    pushl %esi
@@ -888,7 +889,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    leal (%ecx,%ecx), %eax
 ; X86-NEXT:    shrl $31, %ecx
 ; X86-NEXT:    shldl $31, %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %ebx
@@ -901,27 +902,28 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    calll __modti3
 ; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl 40(%ebp), %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl 40(%ebp), %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    leal (%ecx,%ecx), %edx
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    leal (%ecx,%ecx), %esi
 ; X86-NEXT:    shrl $31, %ecx
-; X86-NEXT:    shldl $31, %edx, %ecx
+; X86-NEXT:    shldl $31, %esi, %ecx
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl 40(%ebp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    pushl 40(%ebp)
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %edx
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl $0
+; X86-NEXT:    pushl %ecx
 ; X86-NEXT:    calll __divti3
 ; X86-NEXT:    addl $32, %esp
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -939,31 +941,32 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    sets %bl
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    sets %bh
 ; X86-NEXT:    xorb %bl, %bh
+; X86-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl %edi, %ebx
 ; X86-NEXT:    setne %bl
-; X86-NEXT:    testb %bh, %bl
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    testb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
+; X86-NEXT:    cmovel (%esp), %edx # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    xorl %ebx, %ebx
@@ -991,119 +994,117 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl $-1, %edx
 ; X86-NEXT:    cmovgel %edx, %edi
 ; X86-NEXT:    shldl $31, %eax, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    sets %bl
+; X86-NEXT:    sets %ch
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    sets %bh
-; X86-NEXT:    xorb %bl, %bh
+; X86-NEXT:    sets %cl
+; X86-NEXT:    xorb %ch, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    orl %ecx, %edi
 ; X86-NEXT:    setne %cl
-; X86-NEXT:    testb %bh, %cl
+; X86-NEXT:    testb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    cmpl $-1, %eax
 ; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    cmovgel %ecx, %edx
-; X86-NEXT:    cmovgel %ecx, %edi
+; X86-NEXT:    cmovgel %ecx, %ebx
 ; X86-NEXT:    cmovgel %ecx, %esi
-; X86-NEXT:    movl $-1, %ebx
-; X86-NEXT:    cmovgel %ebx, %eax
+; X86-NEXT:    movl $-1, %edi
+; X86-NEXT:    cmovgel %edi, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    cmovgel %ecx, %eax
-; X86-NEXT:    cmovgel %ebx, %esi
+; X86-NEXT:    cmovgel %edi, %esi
 ; X86-NEXT:    shldl $31, %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %edx
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    sets %bl
+; X86-NEXT:    sets %ch
 ; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    sets %bh
-; X86-NEXT:    xorb %bl, %bh
+; X86-NEXT:    sets %cl
+; X86-NEXT:    xorb %ch, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    orl %ecx, %edi
 ; X86-NEXT:    setne %cl
-; X86-NEXT:    testb %bh, %cl
+; X86-NEXT:    testb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    cmpl $-1, %eax
-; X86-NEXT:    movl %ebx, %ecx
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    cmovgel %ecx, %edx
-; X86-NEXT:    cmovgel %ecx, %edi
 ; X86-NEXT:    cmovgel %ecx, %ebx
-; X86-NEXT:    movl $-1, %esi
-; X86-NEXT:    cmovgel %esi, %eax
+; X86-NEXT:    cmovgel %ecx, %esi
+; X86-NEXT:    movl $-1, %edi
+; X86-NEXT:    cmovgel %edi, %eax
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    cmovgel %ecx, %eax
-; X86-NEXT:    cmovgel %esi, %ebx
-; X86-NEXT:    shldl $31, %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmovgel %edi, %esi
+; X86-NEXT:    shldl $31, %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %ebx
@@ -1180,7 +1181,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 767bd772ab7a3e..c0c229b0b172a7 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -278,6 +278,7 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, %ebx
 ; i686-NEXT:    andl $31, %ebx
+; i686-NEXT:    movl %ebx, (%esp) # 4-byte Spill
 ; i686-NEXT:    shrl $3, %edx
 ; i686-NEXT:    andl $12, %edx
 ; i686-NEXT:    movl 40(%esp,%edx), %eax
@@ -291,7 +292,6 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    movl 44(%esp,%edx), %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl %ebx, %esi
 ; i686-NEXT:    shrdl %cl, %edx, %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
@@ -304,17 +304,19 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    shrl $3, %edx
 ; i686-NEXT:    andl $12, %edx
 ; i686-NEXT:    movl 72(%esp,%edx), %ebx
-; i686-NEXT:    movl 68(%esp,%edx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 68(%esp,%edx), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %ebx, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrdl %cl, %ebx, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl 64(%esp,%edx), %edi
 ; i686-NEXT:    movl 76(%esp,%edx), %edx
 ; i686-NEXT:    shrdl %cl, %edx, %ebx
-; i686-NEXT:    movl %esi, %ecx
+; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; i686-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; i686-NEXT:    movl %eax, %ecx
@@ -541,7 +543,8 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl (%eax), %esi
+; i686-NEXT:    movl (%eax), %ecx
+; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl 4(%eax), %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl 8(%eax), %eax
@@ -562,33 +565,35 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl (%ecx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl (%ecx), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl 4(%ecx), %edi
 ; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 8(%ecx), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 8(%ecx), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    andl $31, %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    shldl %cl, %edi, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shldl %cl, %edi, %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %esi, %eax
+; i686-NEXT:    shll %cl, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    shll %cl, %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; i686-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; i686-NEXT:    negl %ebx
 ; i686-NEXT:    movl 76(%esp,%ebx), %ebx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; i686-NEXT:    shldl %cl, %esi, %ebx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; i686-NEXT:    movl %edi, %esi
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; i686-NEXT:    movl %eax, %ecx
 ; i686-NEXT:    shll %cl, %esi
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; i686-NEXT:    shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; i686-NEXT:    negl %edx
 ; i686-NEXT:    movl 108(%esp,%edx), %edx
diff --git a/llvm/test/CodeGen/X86/smul-with-overflow.ll b/llvm/test/CodeGen/X86/smul-with-overflow.ll
index da0e3fdc1a5272..db56602390de46 100644
--- a/llvm/test/CodeGen/X86/smul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/smul-with-overflow.ll
@@ -191,81 +191,86 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $188, %esp
+; X86-NEXT:    subl $192, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    andl $1, %ebp
-; X86-NEXT:    negl %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    andl $1, %eax
+; X86-NEXT:    negl %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edx, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    addl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl %bl, %esi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
 ; X86-NEXT:    adcl %edx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %edx, %ebx
+; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edx, %edi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    adcl %ebx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %ebx, %edx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -311,18 +316,19 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edi
@@ -340,8 +346,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -349,17 +354,17 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl (%esp), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -376,414 +381,404 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    addl %ebx, %edi
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    adcl %eax, %ebx
+; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl %ebx, %ecx
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %esi, %edi
+; X86-NEXT:    addl %ecx, %esi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    movzbl %al, %edi
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %esi, %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    adcl $0, %ebp
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    addl %ebp, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl %eax, %ebp
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    setb %dl
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %ebp, %ecx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movzbl %bl, %ebx
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edx, %ebp
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl %edx, %ecx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %eax, %ecx
-; X86-NEXT:    movzbl %bl, %esi
 ; X86-NEXT:    adcl %edx, %esi
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    setb %cl
+; X86-NEXT:    addl %eax, %esi
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edx, %eax
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    addl %edi, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl %ebp, %edi
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    addl %ebx, %edx
-; X86-NEXT:    movl %ebp, %esi
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl %ebp, %eax
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    addl %edi, %esi
+; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    setb %dl
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    adcl %ebp, %ecx
+; X86-NEXT:    movzbl %dl, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl (%esp), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %ebx
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %edi, %ebx
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl %ecx, %eax
+; X86-NEXT:    setb %bl
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movzbl %al, %esi
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    adcl %ebx, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl %ebp, %esi
-; X86-NEXT:    setb %al
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl %edx, %esi
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %ebx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movzbl %bl, %edx
+; X86-NEXT:    adcl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    adcl %ebp, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %eax, %eax
-; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    adcl %edx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl %esi, %edx
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    addl %edi, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl %edx, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movzbl %al, %ebp
-; X86-NEXT:    adcl %esi, %ebp
+; X86-NEXT:    adcl %edx, %ebp
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    adcl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ebp, %ebx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    addl %edx, %eax
+; X86-NEXT:    addl %edx, %ecx
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    addl %ecx, %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movzbl %bl, %ebx
-; X86-NEXT:    adcl %esi, %ebx
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    addl %eax, %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    adcl %eax, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl %ebp, %edi
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    adcl %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    setb %bl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    adcl %ecx, %edi
+; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    adcl %esi, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    addl %ecx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl %esi, %eax
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    adcl %esi, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    addl %eax, %ebx
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    adcl %ebx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT:    movl %ebp, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ebp
+; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    orl %ebp, %eax
-; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    orl %ebx, %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    xorl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    xorl %edx, %eax
+; X86-NEXT:    orl %ebx, %eax
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    orl %ebx, %esi
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    xorl %edi, %edx
 ; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    andl $1, %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebp, %ebx
 ; X86-NEXT:    xorl %eax, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    xorl %eax, %esi
@@ -805,7 +800,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X86-NEXT:    movl %edx, 12(%eax)
 ; X86-NEXT:    movb %cl, 16(%eax)
 ; X86-NEXT:    setne 32(%eax)
-; X86-NEXT:    addl $188, %esp
+; X86-NEXT:    addl $192, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -820,10 +815,10 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movq %r9, %r15
+; X64-NEXT:    movq %r9, %r14
 ; X64-NEXT:    movq %rcx, %r9
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rsi, %r12
+; X64-NEXT:    movq %rdx, %r12
+; X64-NEXT:    movq %rsi, %r15
 ; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; X64-NEXT:    andl $1, %r11d
@@ -839,7 +834,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    addq %rdx, %rbp
 ; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    movq %r9, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    addq %rax, %rbp
@@ -848,25 +843,25 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    movzbl %sil, %edi
 ; X64-NEXT:    addq %rax, %rcx
 ; X64-NEXT:    adcq %rdx, %rdi
-; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %r10
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    movq %rax, %r13
 ; X64-NEXT:    addq %r10, %r13
 ; X64-NEXT:    adcq $0, %rbx
-; X64-NEXT:    movq %r12, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    addq %r13, %rax
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; X64-NEXT:    adcq %rbx, %rsi
 ; X64-NEXT:    setb %r8b
-; X64-NEXT:    movq %r14, %rax
-; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %r14
 ; X64-NEXT:    movq %rdx, %rbx
 ; X64-NEXT:    addq %rsi, %rax
 ; X64-NEXT:    movzbl %r8b, %edx
@@ -877,11 +872,11 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    adcq $0, %rcx
 ; X64-NEXT:    adcq $0, %rdi
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r12
+; X64-NEXT:    mulq %r15
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rax, %r15
 ; X64-NEXT:    movq %r11, %rax
-; X64-NEXT:    mulq %r14
+; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    movq %rax, %r8
 ; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -913,14 +908,13 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    adcq %rdx, %rdi
 ; X64-NEXT:    setb %bl
 ; X64-NEXT:    addq %rax, %rdi
-; X64-NEXT:    movzbl %bl, %esi
-; X64-NEXT:    adcq %rdx, %rsi
+; X64-NEXT:    movzbl %bl, %ebx
+; X64-NEXT:    adcq %rdx, %rbx
 ; X64-NEXT:    addq %rax, %rbp
 ; X64-NEXT:    adcq %r12, %r10
 ; X64-NEXT:    movzbl %cl, %eax
 ; X64-NEXT:    adcq %rax, %rdi
-; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    adcq $0, %rbx
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    movq %rsi, %r8
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
@@ -928,8 +922,8 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    adcq $0, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; X64-NEXT:    addq %rbx, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; X64-NEXT:    addq %r12, %r8
 ; X64-NEXT:    adcq %rax, %rcx
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    addq %rsi, %rcx
@@ -937,7 +931,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    adcq %rdx, %rsi
 ; X64-NEXT:    movq %r9, %rax
 ; X64-NEXT:    imulq %r11
-; X64-NEXT:    movq %rbx, %r11
+; X64-NEXT:    movq %r12, %r11
 ; X64-NEXT:    addq %rax, %r11
 ; X64-NEXT:    movq %r8, %r12
 ; X64-NEXT:    adcq %rdx, %r12
@@ -950,9 +944,9 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    addq %rcx, %r9
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; X64-NEXT:    adcq %rsi, %r13
-; X64-NEXT:    setb %bl
+; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
 ; X64-NEXT:    addq %rcx, %r13
-; X64-NEXT:    movzbl %bl, %ecx
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X64-NEXT:    adcq %rsi, %rcx
 ; X64-NEXT:    addq %r15, %rax
 ; X64-NEXT:    adcq %r9, %rdx
@@ -965,7 +959,7 @@ define { i129, i1 } @smul_ovf(i129 %x, i129 %y) nounwind {
 ; X64-NEXT:    addq %rbp, %r15
 ; X64-NEXT:    adcq %r10, %r9
 ; X64-NEXT:    adcq %rdi, %rax
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
+; X64-NEXT:    adcq %rbx, %rdx
 ; X64-NEXT:    movq %r14, %rcx
 ; X64-NEXT:    sarq $63, %rcx
 ; X64-NEXT:    xorq %rcx, %rdx
diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index ce56283df6010b..582d1eefa88f55 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -369,24 +369,26 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull %ebx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    adcl %edx, %edi
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll
index 85c966c447fad6..5237a4d4fde00e 100644
--- a/llvm/test/CodeGen/X86/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll
@@ -61,30 +61,27 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    imull %esi
+; X86-NEXT:    imull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %edx
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
@@ -101,18 +98,18 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    cmovnsl %ebp, %edx
 ; X86-NEXT:    cmovnsl %edi, %ecx
 ; X86-NEXT:    testl %edx, %edx
-; X86-NEXT:    setg %ah
-; X86-NEXT:    sete (%esp) # 1-byte Folded Spill
+; X86-NEXT:    setg %bh
+; X86-NEXT:    sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    cmpl $2, %ecx
-; X86-NEXT:    setae %al
-; X86-NEXT:    andb (%esp), %al # 1-byte Folded Reload
-; X86-NEXT:    orb %ah, %al
+; X86-NEXT:    setae %bl
+; X86-NEXT:    andb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
+; X86-NEXT:    orb %bh, %bl
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    shrdl $2, %ebx, %ebp
-; X86-NEXT:    shrdl $2, %ecx, %ebx
-; X86-NEXT:    testb %al, %al
+; X86-NEXT:    shrdl $2, %eax, %ebp
+; X86-NEXT:    shrdl $2, %ecx, %eax
+; X86-NEXT:    testb %bl, %bl
 ; X86-NEXT:    movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT:    cmovel %ebx, %esi
+; X86-NEXT:    cmovel %eax, %esi
 ; X86-NEXT:    movl $-1, %edi
 ; X86-NEXT:    cmovel %ebp, %edi
 ; X86-NEXT:    cmpl $-1, %edx
@@ -411,8 +408,7 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    adcl %edi, %ebp
@@ -629,24 +625,26 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull %ebx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
@@ -716,31 +714,33 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %edx, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    imull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    imull %ebx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    adcl %edx, %edi
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    addl %ecx, %edi
+; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    subl %ecx, %edx
 ; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    sbbl $0, %ebp
-; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    cmovnsl %ebx, %ebp
 ; X86-NEXT:    cmovnsl %edi, %edx
 ; X86-NEXT:    movl %edx, %esi
diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
index 816633b5b18ab8..e84978c54f936f 100644
--- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll
@@ -192,8 +192,7 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
@@ -220,36 +219,36 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %esi, %ebp
+; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl %ebx, %ebp
-; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %ebx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    imull %ebx, %esi
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    addl %eax, %edx
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    adcl %ebp, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    addl %ebp, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    addl %eax, %edi
 ; X86-NEXT:    adcl %edx, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebx
@@ -339,22 +338,22 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    .cfi_offset %r14, -32
 ; X64-NEXT:    .cfi_offset %r15, -24
 ; X64-NEXT:    .cfi_offset %rbp, -16
-; X64-NEXT:    movq %rcx, %r13
-; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rsi, %r10
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r11
-; X64-NEXT:    movq %r13, %rax
-; X64-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    mulq %r8
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %r14
 ; X64-NEXT:    addq %rcx, %r14
 ; X64-NEXT:    adcq $0, %rsi
-; X64-NEXT:    movq %r15, %rax
+; X64-NEXT:    movq %r12, %rax
 ; X64-NEXT:    movq %r9, %rcx
 ; X64-NEXT:    mulq %r9
 ; X64-NEXT:    movq %rdx, %r12
@@ -363,7 +362,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    adcq %rsi, %r12
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movzbl %al, %r9d
-; X64-NEXT:    movq %r13, %rax
+; X64-NEXT:    movq %r15, %rax
 ; X64-NEXT:    mulq %rcx
 ; X64-NEXT:    movq %rcx, %r14
 ; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
@@ -437,15 +436,15 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    adcq $0, %r11
 ; X64-NEXT:    addq %rsi, %r9
 ; X64-NEXT:    adcq %rcx, %r11
-; X64-NEXT:    setb %bl
+; X64-NEXT:    setb %r13b
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 ## 8-byte Reload
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rax, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp ## 8-byte Reload
-; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
+; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    mulq %rsi
 ; X64-NEXT:    movq %rdx, %rsi
 ; X64-NEXT:    movq %rax, %rdi
@@ -454,25 +453,26 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    movq %r10, %rax
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    addq %rdi, %rax
-; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    movq %rax, %rbp
+; X64-NEXT:    addq %rdi, %rbp
 ; X64-NEXT:    adcq %rsi, %r10
 ; X64-NEXT:    setb %cl
-; X64-NEXT:    movq %rbp, %rax
+; X64-NEXT:    movq %rbx, %rsi
+; X64-NEXT:    movq %rbx, %rax
 ; X64-NEXT:    mulq %r12
-; X64-NEXT:    movq %rdx, %r13
-; X64-NEXT:    movq %rax, %r15
-; X64-NEXT:    addq %r10, %r15
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq %rax, %rbx
+; X64-NEXT:    addq %r10, %rbx
 ; X64-NEXT:    movzbl %cl, %eax
-; X64-NEXT:    adcq %rax, %r13
+; X64-NEXT:    adcq %rax, %r15
 ; X64-NEXT:    addq %r9, %r14
 ; X64-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    adcq %r11, %rdi
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; X64-NEXT:    movzbl %bl, %eax
-; X64-NEXT:    adcq %rax, %r15
-; X64-NEXT:    adcq $0, %r13
-; X64-NEXT:    movq %rbp, %rdi
+; X64-NEXT:    adcq %r11, %rbp
+; X64-NEXT:    movzbl %r13b, %eax
+; X64-NEXT:    adcq %rax, %rbx
+; X64-NEXT:    adcq $0, %r15
+; X64-NEXT:    movq %rsi, %rdi
+; X64-NEXT:    movq %rsi, %r13
 ; X64-NEXT:    sarq $63, %rdi
 ; X64-NEXT:    movq %r8, %rax
 ; X64-NEXT:    mulq %rdi
@@ -487,20 +487,19 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    movq %rdx, %r9
 ; X64-NEXT:    adcq $0, %r9
 ; X64-NEXT:    addq %rsi, %r11
-; X64-NEXT:    movq %rsi, %rbx
 ; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; X64-NEXT:    adcq %r10, %r9
-; X64-NEXT:    setb %sil
+; X64-NEXT:    setb %r10b
 ; X64-NEXT:    movq %rdi, %r8
 ; X64-NEXT:    imulq %r12, %r8
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    mulq {{[0-9]+}}(%rsp)
 ; X64-NEXT:    addq %r8, %rdx
 ; X64-NEXT:    addq %rax, %rdx
-; X64-NEXT:    addq %rbx, %rax
+; X64-NEXT:    addq %rsi, %rax
 ; X64-NEXT:    adcq %r11, %rdx
 ; X64-NEXT:    addq %r14, %r9
-; X64-NEXT:    movzbl %sil, %esi
+; X64-NEXT:    movzbl %r10b, %esi
 ; X64-NEXT:    adcq %rcx, %rsi
 ; X64-NEXT:    addq %rax, %r9
 ; X64-NEXT:    adcq %rdx, %rsi
@@ -518,16 +517,16 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    movq %rax, %r10
 ; X64-NEXT:    addq %rax, %r14
 ; X64-NEXT:    adcq %rdx, %rdi
-; X64-NEXT:    setb %bl
-; X64-NEXT:    imulq %r12, %rbp
+; X64-NEXT:    setb {{[-0-9]+}}(%r{{[sb]}}p) ## 1-byte Folded Spill
+; X64-NEXT:    imulq %r12, %r13
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
 ; X64-NEXT:    mulq %r12
 ; X64-NEXT:    addq %rax, %rdx
-; X64-NEXT:    addq %rbp, %rdx
+; X64-NEXT:    addq %r13, %rdx
 ; X64-NEXT:    addq %rcx, %rax
 ; X64-NEXT:    adcq %r14, %rdx
 ; X64-NEXT:    addq %r10, %rdi
-; X64-NEXT:    movzbl %bl, %r10d
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d ## 1-byte Folded Reload
 ; X64-NEXT:    adcq %r8, %r10
 ; X64-NEXT:    addq %rax, %rdi
 ; X64-NEXT:    adcq %rdx, %r10
@@ -536,9 +535,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X64-NEXT:    adcq %r9, %rdi
 ; X64-NEXT:    adcq %rsi, %r10
 ; X64-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx ## 8-byte Folded Reload
-; X64-NEXT:    adcq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Folded Reload
-; X64-NEXT:    adcq %r15, %rdi
-; X64-NEXT:    adcq %r13, %r10
+; X64-NEXT:    adcq %rbp, %r14
+; X64-NEXT:    adcq %rbx, %rdi
+; X64-NEXT:    adcq %r15, %r10
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx ## 8-byte Reload
 ; X64-NEXT:    movq %rdx, %rax
 ; X64-NEXT:    sarq $63, %rax
@@ -599,8 +598,8 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -609,9 +608,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
@@ -622,25 +621,25 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %ebp, %edi
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, %ecx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
@@ -656,7 +655,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -667,12 +666,12 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebp
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %ecx, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
+; X86-NEXT:    adcl (%esp), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
@@ -685,8 +684,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
@@ -701,7 +699,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl %ebp, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    movzbl (%esp), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
@@ -729,7 +727,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
@@ -746,22 +744,23 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %ebp, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -775,8 +774,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
@@ -793,14 +791,14 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
@@ -808,8 +806,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
@@ -819,23 +816,22 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl %ecx, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
 ; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    adcl %edi, %edx
+; X86-NEXT:    movl %ebx, %esi
+; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %eax
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
@@ -846,27 +842,27 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl %ecx, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -897,7 +893,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %ebx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
@@ -918,17 +914,17 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %esi
+; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    adcl (%esp), %esi ## 4-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -942,45 +938,42 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    addl %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    addl %edi, %ecx
-; X86-NEXT:    movzbl %bl, %eax
-; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    movl (%esp), %edi ## 4-byte Reload
-; X86-NEXT:    adcl %esi, %edi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
-; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movzbl %cl, %ecx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
+; X86-NEXT:    adcl %ecx, %eax
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    adcl $0, %edi
+; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    adcl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    movl %edi, (%esp) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
@@ -997,20 +990,20 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %esi, %edi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    addl %edi, %ebp
 ; X86-NEXT:    adcl %ecx, %esi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -1024,23 +1017,24 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %edi
-; X86-NEXT:    setb %bl
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
-; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ecx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %ebp, %ecx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl $0, (%esp) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    mull %edi
@@ -1053,8 +1047,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
@@ -1072,7 +1065,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    adcl (%esp), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -1086,8 +1079,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %esi, %ebx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, %ebx
@@ -1107,7 +1099,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
@@ -1128,70 +1120,72 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ecx, (%esp) ## 4-byte Spill
 ; X86-NEXT:    addl %ecx, %eax
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    adcl $0, %ebx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %ecx, %ebx
+; X86-NEXT:    adcl %ecx, %ebp
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %ebx
+; X86-NEXT:    addl %ebx, %ebp
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    mull %esi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    addl %esi, %ecx
+; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl %ebp, %edi
-; X86-NEXT:    movzbl %al, %edx
-; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %edi, %ecx
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
+; X86-NEXT:    setb %dl
+; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    adcl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    addl %edi, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    adcl $0, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    addl %ebp, %edi
+; X86-NEXT:    addl %eax, %edi
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    addl (%esp), %edi ## 4-byte Folded Reload
+; X86-NEXT:    addl %ebx, %edi
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    adcl %ebp, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
@@ -1220,53 +1214,56 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    adcl %ebx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    adcl %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edx, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    addl %eax, %edx
 ; X86-NEXT:    adcl $0, %eax
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
-; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %ecx
 ; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    adcl %ebx, %eax
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl %edx, %eax
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 1-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 1-byte Folded Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    adcl %eax, %ecx
+; X86-NEXT:    adcl (%esp), %eax ## 4-byte Folded Reload
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    adcl %ebp, %edx
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
+; X86-NEXT:    adcl %ebp, %edx
+; X86-NEXT:    movl %esi, %ebp
+; X86-NEXT:    adcl %ebx, %ebp
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
+; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %edx, %ecx
@@ -1278,78 +1275,75 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    setb (%esp) ## 1-byte Folded Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    movzbl (%esp), %ebp ## 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %edx, %ebp
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    adcl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    addl %eax, %edi
-; X86-NEXT:    adcl %edx, %esi
-; X86-NEXT:    setb %cl
-; X86-NEXT:    addl %eax, %esi
-; X86-NEXT:    movzbl %cl, %eax
-; X86-NEXT:    adcl %edx, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; X86-NEXT:    adcl %edi, (%esp) ## 4-byte Folded Spill
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    adcl %edx, %ecx
+; X86-NEXT:    setb %dl
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    movzbl %dl, %edx
+; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    adcl $0, %eax
-; X86-NEXT:    addl %ebx, %ecx
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    setb %al
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    adcl %edi, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    adcl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    adcl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    adcl $0, %edx
+; X86-NEXT:    addl %ebx, %esi
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
+; X86-NEXT:    setb %bl
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl %edi, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movzbl %bl, %edx
+; X86-NEXT:    adcl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; X86-NEXT:    addl %eax, %edi
+; X86-NEXT:    addl %eax, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    movl %ecx, %ebx
 ; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
-; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl (%esp), %esi ## 4-byte Reload
+; X86-NEXT:    addl %esi, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) ## 1-byte Folded Spill
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    addl %eax, %ebp
-; X86-NEXT:    addl %edx, %eax
-; X86-NEXT:    adcl %edi, %ebp
-; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    addl %esi, %eax
+; X86-NEXT:    adcl %edx, %ebp
+; X86-NEXT:    addl %edi, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 1-byte Folded Reload
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl %eax, %ebx
@@ -1385,7 +1379,7 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    adcl %edx, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
+; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
 ; X86-NEXT:    addl %eax, %esi
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
@@ -1395,26 +1389,26 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; X86-NEXT:    movl (%esp), %eax ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; X86-NEXT:    movl (%esp), %esi ## 4-byte Reload
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; X86-NEXT:    movl %esi, (%esp) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Folded Reload
-; X86-NEXT:    movl %eax, (%esp) ## 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
@@ -1427,11 +1421,11 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) {
 ; X86-NEXT:    xorl %eax, %ecx
 ; X86-NEXT:    orl %edx, %ecx
 ; X86-NEXT:    xorl %eax, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; X86-NEXT:    movl (%esp), %edx ## 4-byte Reload
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    movl (%esp), %ecx ## 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
 ; X86-NEXT:    xorl %eax, %ecx
 ; X86-NEXT:    xorl %eax, %ebx
 ; X86-NEXT:    orl %ecx, %ebx
diff --git a/llvm/test/CodeGen/X86/sse-regcall4.ll b/llvm/test/CodeGen/X86/sse-regcall4.ll
index c8df7a233d7e3f..15424b7cc2be7b 100644
--- a/llvm/test/CodeGen/X86/sse-regcall4.ll
+++ b/llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -200,20 +200,20 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
 ; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl %ecx, %edi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    leal (%eax,%esi), %ecx
 ; WIN32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN32-NEXT:    movl %eax, %ebx
 ; WIN32-NEXT:    subl %esi, %ebx
 ; WIN32-NEXT:    movl %edi, %eax
 ; WIN32-NEXT:    subl %edx, %eax
-; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    imull %eax, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    imull %eax, %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN32-NEXT:    movl %edx, %esi
 ; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    imull %ebx, %esi
-; WIN32-NEXT:    addl %ebp, %esi
+; WIN32-NEXT:    addl %ecx, %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl %ebp, %ebx
 ; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
index f91758b861b4c4..d7b172f6614556 100644
--- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
@@ -71,24 +71,24 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; X86-NEXT:    subl $20, %esp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    shldl %cl, %edx, %edi
+; X86-NEXT:    shldl %cl, %edx, %esi
 ; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %eax, %edi
+; X86-NEXT:    cmovnel %eax, %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    cmovnel %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sarl %cl, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovel %ebx, %eax
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sarl %cl, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    testb $32, %cl
+; X86-NEXT:    cmovel %eax, %esi
+; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %eax
@@ -107,8 +107,9 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    testb $32, %ch
 ; X86-NEXT:    cmovel %esi, %edx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    shrdl %cl, %edi, %eax
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    cmovnel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
@@ -119,13 +120,13 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; X86-NEXT:    cmovnel %esi, %edi
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    xorl %esi, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    sarl $31, %esi
 ; X86-NEXT:    movl %esi, %ecx
 ; X86-NEXT:    xorl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    orl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    notl %esi
-; X86-NEXT:    cmovel (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %edi
diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll
index 787a33aa49b20e..9cf647f8100cdd 100644
--- a/llvm/test/CodeGen/X86/statepoint-live-in.ll
+++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll
@@ -372,8 +372,8 @@ define void @test10(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32
 ; CHECK-NEXT:    .cfi_offset %r14, -32
 ; CHECK-NEXT:    .cfi_offset %r15, -24
 ; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movl %r9d, %ebp
-; CHECK-NEXT:    movl %r8d, %ebx
+; CHECK-NEXT:    movl %r9d, %ebx
+; CHECK-NEXT:    movl %r8d, %ebp
 ; CHECK-NEXT:    movl %ecx, %r14d
 ; CHECK-NEXT:    movl %edx, %r15d
 ; CHECK-NEXT:    movl %esi, %r12d
diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll
index 5c26e29dce45ed..cbbdae1616fe8a 100644
--- a/llvm/test/CodeGen/X86/statepoint-regs.ll
+++ b/llvm/test/CodeGen/X86/statepoint-regs.ll
@@ -484,8 +484,8 @@ define void @test10(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32
 ; CHECK-NEXT:    .cfi_offset %r14, -32
 ; CHECK-NEXT:    .cfi_offset %r15, -24
 ; CHECK-NEXT:    .cfi_offset %rbp, -16
-; CHECK-NEXT:    movl %r9d, %ebp
-; CHECK-NEXT:    movl %r8d, %ebx
+; CHECK-NEXT:    movl %r9d, %ebx
+; CHECK-NEXT:    movl %r8d, %ebp
 ; CHECK-NEXT:    movl %ecx, %r14d
 ; CHECK-NEXT:    movl %edx, %r15d
 ; CHECK-NEXT:    movl %esi, %r12d
diff --git a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll
index 0594f2fbc0a35f..db7aa99f5d1bfb 100644
--- a/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll
+++ b/llvm/test/CodeGen/X86/statepoint-vreg-unlimited-tied-opnds.ll
@@ -34,87 +34,92 @@ define i32 @test_spill(
   ; CHECK-VREG-NEXT:   [[MOV64rm11]]:gr64, [[MOV64rm10]]:gr64, [[MOV64rm9]]:gr64, [[MOV64rm8]]:gr64, [[MOV64rm7]]:gr64, [[MOV64rm6]]:gr64, [[MOV64rm5]]:gr64, [[MOV64rm4]]:gr64, [[MOV64rm3]]:gr64, [[MOV64rm2]]:gr64, [[MOV64rm1]]:gr64, [[MOV64rm]]:gr64, [[COPY]]:gr64, [[COPY1]]:gr64, [[COPY2]]:gr64, [[COPY3]]:gr64, [[COPY4]]:gr64, [[COPY5]]:gr64 = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, [[MOV64rm11]](tied-def 0), [[MOV64rm10]](tied-def 1), [[MOV64rm9]](tied-def 2), [[MOV64rm8]](tied-def 3), [[MOV64rm7]](tied-def 4), [[MOV64rm6]](tied-def 5), [[MOV64rm5]](tied-def 6), [[MOV64rm4]](tied-def 7), [[MOV64rm3]](tied-def 8), [[MOV64rm2]](tied-def 9), [[MOV64rm1]](tied-def 10), [[MOV64rm]](tied-def 11), [[COPY]](tied-def 12), [[COPY1]](tied-def 13), [[COPY2]](tied-def 14), [[COPY3]](tied-def 15), [[COPY4]](tied-def 16), [[COPY5]](tied-def 17), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp
   ; CHECK-VREG-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY5]], 1, $noreg, 4, $noreg :: (load (s32) from %ir.gep00, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm]], [[COPY4]], 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[COPY3]], 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[COPY2]], 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[COPY1]], 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[COPY]], 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm]], 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm1]], 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm2]], 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm3]], 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm4]], 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm5]], 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm6]], 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm7]], 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm8]], 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm9]], 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm10]], 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1)
-  ; CHECK-VREG-NEXT:   [[ADD32rm1:%[0-9]+]]:gr32 = ADD32rm [[ADD32rm1]], [[MOV64rm11]], 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1)
-  ; CHECK-VREG-NEXT:   $eax = COPY [[ADD32rm1]]
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY4]], 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY3]], 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY2]], 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY1]], 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[COPY]], 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm]], 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm1]], 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm2]], 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm3]], 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm4]], 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm5]], 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm6]], 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm7]], 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm8]], 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm9]], 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm10]], 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1)
+  ; CHECK-VREG-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = ADD32rm [[MOV32rm]], [[MOV64rm11]], 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1)
+  ; CHECK-VREG-NEXT:   $eax = COPY [[MOV32rm]]
   ; CHECK-VREG-NEXT:   RET 0, killed $eax
+  ;
   ; CHECK-PREG-LABEL: name: test_spill
   ; CHECK-PREG: bb.0 (%ir-block.0):
   ; CHECK-PREG-NEXT:   liveins: $rcx, $rdi, $rdx, $rsi, $r8, $r9
   ; CHECK-PREG-NEXT: {{  $}}
-  ; CHECK-PREG-NEXT:   MOV64mr %stack.2, 1, $noreg, 0, $noreg, $r9 :: (store (s64) into %stack.2)
-  ; CHECK-PREG-NEXT:   MOV64mr %stack.6, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.6)
-  ; CHECK-PREG-NEXT:   MOV64mr %stack.9, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.9)
-  ; CHECK-PREG-NEXT:   MOV64mr %stack.10, 1, $noreg, 0, $noreg, $rdx :: (store (s64) into %stack.10)
-  ; CHECK-PREG-NEXT:   MOV64mr %stack.11, 1, $noreg, 0, $noreg, $rsi :: (store (s64) into %stack.11)
+  ; CHECK-PREG-NEXT:   MOV64mr %stack.7, 1, $noreg, 0, $noreg, $r9 :: (store (s64) into %stack.7)
+  ; CHECK-PREG-NEXT:   renamable $r14 = COPY $r8
+  ; CHECK-PREG-NEXT:   renamable $r15 = COPY $rcx
+  ; CHECK-PREG-NEXT:   renamable $r12 = COPY $rdx
+  ; CHECK-PREG-NEXT:   renamable $r13 = COPY $rsi
   ; CHECK-PREG-NEXT:   renamable $rbp = COPY $rdi
-  ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.11, align 16)
-  ; CHECK-PREG-NEXT:   MOV64mr %stack.8, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.8)
+  ; CHECK-PREG-NEXT:   renamable $rbx = MOV64rm %fixed-stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.11, align 16)
   ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.10)
-  ; CHECK-PREG-NEXT:   MOV64mr %stack.7, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.7)
+  ; CHECK-PREG-NEXT:   MOV64mr %stack.11, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.11)
   ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.9, align 16)
-  ; CHECK-PREG-NEXT:   MOV64mr %stack.5, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.5)
+  ; CHECK-PREG-NEXT:   MOV64mr %stack.10, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.10)
   ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.8)
-  ; CHECK-PREG-NEXT:   MOV64mr %stack.4, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.4)
+  ; CHECK-PREG-NEXT:   MOV64mr %stack.9, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.9)
   ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.7, align 16)
-  ; CHECK-PREG-NEXT:   MOV64mr %stack.3, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.3)
+  ; CHECK-PREG-NEXT:   MOV64mr %stack.8, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.8)
   ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.6)
-  ; CHECK-PREG-NEXT:   MOV64mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.1)
+  ; CHECK-PREG-NEXT:   MOV64mr %stack.6, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.6)
   ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.5, align 16)
+  ; CHECK-PREG-NEXT:   MOV64mr %stack.5, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.5)
+  ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.4)
+  ; CHECK-PREG-NEXT:   MOV64mr %stack.4, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.4)
+  ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16)
+  ; CHECK-PREG-NEXT:   MOV64mr %stack.3, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.3)
+  ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2)
+  ; CHECK-PREG-NEXT:   MOV64mr %stack.2, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.2)
+  ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.1, align 16)
+  ; CHECK-PREG-NEXT:   MOV64mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.1)
+  ; CHECK-PREG-NEXT:   renamable $rax = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0)
   ; CHECK-PREG-NEXT:   MOV64mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $rax :: (store (s64) into %stack.0)
-  ; CHECK-PREG-NEXT:   renamable $r13 = MOV64rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.4)
-  ; CHECK-PREG-NEXT:   renamable $r12 = MOV64rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.3, align 16)
-  ; CHECK-PREG-NEXT:   renamable $r15 = MOV64rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.2)
-  ; CHECK-PREG-NEXT:   renamable $rbx = MOV64rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.1, align 16)
-  ; CHECK-PREG-NEXT:   renamable $r14 = MOV64rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %fixed-stack.0)
   ; CHECK-PREG-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
-  ; CHECK-PREG-NEXT:   renamable $r14, renamable $rbx, renamable $r15, renamable $r12, renamable $r13, renamable $rbp = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, killed renamable $r14(tied-def 0), killed renamable $rbx(tied-def 1), killed renamable $r15(tied-def 2), killed renamable $r12(tied-def 3), killed renamable $r13(tied-def 4), 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 1, 8, %stack.3, 0, 1, 8, %stack.4, 0, 1, 8, %stack.5, 0, 1, 8, %stack.7, 0, 1, 8, %stack.8, 0, 1, 8, %stack.2, 0, 1, 8, %stack.6, 0, 1, 8, %stack.9, 0, 1, 8, %stack.10, 0, 1, 8, %stack.11, 0, killed renamable $rbp(tied-def 5), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11)
+  ; CHECK-PREG-NEXT:   renamable $rbx, renamable $r14, renamable $r15, renamable $r12, renamable $r13, renamable $rbp = STATEPOINT 0, 0, 0, @func, 2, 0, 2, 0, 2, 0, 2, 18, 1, 8, %stack.0, 0, 1, 8, %stack.1, 0, 1, 8, %stack.2, 0, 1, 8, %stack.3, 0, 1, 8, %stack.4, 0, 1, 8, %stack.5, 0, 1, 8, %stack.6, 0, 1, 8, %stack.8, 0, 1, 8, %stack.9, 0, 1, 8, %stack.10, 0, 1, 8, %stack.11, 0, killed renamable $rbx(tied-def 0), 1, 8, %stack.7, 0, killed renamable $r14(tied-def 1), killed renamable $r15(tied-def 2), killed renamable $r12(tied-def 3), killed renamable $r13(tied-def 4), killed renamable $rbp(tied-def 5), 2, 0, 2, 18, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, csr_64, implicit-def $rsp, implicit-def $ssp :: (load store (s64) on %stack.0), (load store (s64) on %stack.1), (load store (s64) on %stack.2), (load store (s64) on %stack.3), (load store (s64) on %stack.4), (load store (s64) on %stack.5), (load store (s64) on %stack.6), (load store (s64) on %stack.7), (load store (s64) on %stack.8), (load store (s64) on %stack.9), (load store (s64) on %stack.10), (load store (s64) on %stack.11)
   ; CHECK-PREG-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
   ; CHECK-PREG-NEXT:   renamable $eax = MOV32rm killed renamable $rbp, 1, $noreg, 4, $noreg :: (load (s32) from %ir.gep00, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %stack.8)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %stack.1)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $rdi = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rdi, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $r13, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $r12, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbx, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1)
-  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $r13, 1, $noreg, 8, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep01, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $r12, 1, $noreg, 12, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep02, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $r15, 1, $noreg, 16, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep03, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $r14, 1, $noreg, 20, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep04, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 24, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep05, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rbx, 1, $noreg, 28, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep06, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 32, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep07, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 36, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep08, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 40, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep09, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.8, 1, $noreg, 0, $noreg :: (load (s64) from %stack.8)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 44, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep10, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 48, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep11, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 52, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep12, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 56, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep13, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 60, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep14, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.2, 1, $noreg, 0, $noreg :: (load (s64) from %stack.2)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 64, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep15, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.1, 1, $noreg, 0, $noreg :: (load (s64) from %stack.1)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 68, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep16, addrspace 1)
+  ; CHECK-PREG-NEXT:   renamable $rcx = MOV64rm %stack.0, 1, $noreg, 0, $noreg :: (load (s64) from %stack.0)
+  ; CHECK-PREG-NEXT:   renamable $eax = ADD32rm killed renamable $eax, killed renamable $rcx, 1, $noreg, 72, $noreg, implicit-def dead $eflags :: (load (s32) from %ir.gep17, addrspace 1)
   ; CHECK-PREG-NEXT:   RET 0, $eax
     ptr addrspace(1) %arg00, ptr addrspace(1) %arg01, ptr addrspace(1) %arg02, ptr addrspace(1) %arg03, ptr addrspace(1) %arg04, ptr addrspace(1) %arg05,
     ptr addrspace(1) %arg06, ptr addrspace(1) %arg07, ptr addrspace(1) %arg08, ptr addrspace(1) %arg09, ptr addrspace(1) %arg10, ptr addrspace(1) %arg11,
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index f1fd05565c47e9..08017ba1b07843 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -389,7 +389,7 @@ define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec128_v2i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
@@ -452,7 +452,7 @@ define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec128_v2f32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
@@ -599,7 +599,7 @@ define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec128_v4i16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
@@ -694,7 +694,7 @@ define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
 ; AVX512-LABEL: vec128_v8i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa %xmm0, (%rdx)
@@ -1003,7 +1003,7 @@ define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec256_v2i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -1079,7 +1079,7 @@ define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec256_v2f32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -1355,7 +1355,7 @@ define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec256_v4i16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -1550,7 +1550,7 @@ define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
 ; AVX512-LABEL: vec256_v8i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -2170,7 +2170,7 @@ define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec384_v2i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -2258,7 +2258,7 @@ define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec384_v2f32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -2722,7 +2722,7 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
 ; AVX512-LABEL: vec384_v3i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpextrb $2, %xmm0, 2(%rsi)
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    movw %ax, (%rsi)
@@ -3006,7 +3006,7 @@ define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec384_v3i16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpextrw $2, %xmm0, 4(%rsi)
 ; AVX512-NEXT:    vmovd %xmm0, (%rsi)
 ; AVX512-NEXT:    vpextrw $2, %xmm0, 4(%rdx)
@@ -3664,7 +3664,7 @@ define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec384_v4i16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -3983,7 +3983,7 @@ define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
 ; AVX512-LABEL: vec384_v6i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vpextrw $2, %xmm0, 4(%rsi)
 ; AVX512-NEXT:    vmovd %xmm0, (%rsi)
 ; AVX512-NEXT:    vpextrw $2, %xmm0, 4(%rdx)
@@ -4420,7 +4420,7 @@ define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
 ; AVX512-LABEL: vec384_v8i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rdx)
@@ -5444,7 +5444,7 @@ define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec512_v2i32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
@@ -5540,7 +5540,7 @@ define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec512_v2f32:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
@@ -5965,7 +5965,7 @@ define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
 ; AVX512-LABEL: vec512_v4i16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
@@ -6363,7 +6363,7 @@ define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
 ; AVX512-LABEL: vec512_v8i8:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = ~xmm0
 ; AVX512-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX512-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; AVX512-NEXT:    vmovdqa64 %zmm0, (%rdx)
@@ -6908,114 +6908,103 @@ define void @vec512_v16i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec
 ; SCALAR-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; SCALAR-NEXT:    movl 28(%rdi), %eax
 ; SCALAR-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SCALAR-NEXT:    movzwl 26(%rdi), %eax
-; SCALAR-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SCALAR-NEXT:    movl 24(%rdi), %r13d
-; SCALAR-NEXT:    movzwl 22(%rdi), %r12d
-; SCALAR-NEXT:    movl 20(%rdi), %r15d
-; SCALAR-NEXT:    movzwl 18(%rdi), %r14d
+; SCALAR-NEXT:    movzwl 26(%rdi), %r13d
+; SCALAR-NEXT:    movl 24(%rdi), %r12d
+; SCALAR-NEXT:    movzwl 22(%rdi), %r15d
+; SCALAR-NEXT:    movl 20(%rdi), %r14d
+; SCALAR-NEXT:    movzwl 18(%rdi), %ebp
 ; SCALAR-NEXT:    movl 16(%rdi), %ebx
 ; SCALAR-NEXT:    movzwl 14(%rdi), %r11d
 ; SCALAR-NEXT:    movl 12(%rdi), %r10d
 ; SCALAR-NEXT:    movzwl 10(%rdi), %r9d
 ; SCALAR-NEXT:    movl 8(%rdi), %r8d
 ; SCALAR-NEXT:    movzwl 6(%rdi), %ecx
-; SCALAR-NEXT:    movzwl 2(%rdi), %ebp
+; SCALAR-NEXT:    movzwl 2(%rdi), %eax
+; SCALAR-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; SCALAR-NEXT:    movl (%rdi), %eax
 ; SCALAR-NEXT:    movl 4(%rdi), %edi
 ; SCALAR-NEXT:    notl %eax
 ; SCALAR-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SCALAR-NEXT:    notl %ebp
-; SCALAR-NEXT:    movl %ebp, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SCALAR-NEXT:    notl {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
 ; SCALAR-NEXT:    notl %edi
 ; SCALAR-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; SCALAR-NEXT:    notl %ecx
-; SCALAR-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SCALAR-NEXT:    movl %ecx, %eax
 ; SCALAR-NEXT:    notl %r8d
 ; SCALAR-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; SCALAR-NEXT:    notl %r9d
-; SCALAR-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SCALAR-NEXT:    movl %r10d, %edi
-; SCALAR-NEXT:    notl %edi
-; SCALAR-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SCALAR-NEXT:    notl %r10d
 ; SCALAR-NEXT:    notl %r11d
-; SCALAR-NEXT:    movl %r11d, %r9d
 ; SCALAR-NEXT:    notl %ebx
-; SCALAR-NEXT:    movl %ebx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SCALAR-NEXT:    notl %ebp
 ; SCALAR-NEXT:    notl %r14d
 ; SCALAR-NEXT:    notl %r15d
 ; SCALAR-NEXT:    notl %r12d
-; SCALAR-NEXT:    notl %r13d
-; SCALAR-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload
-; SCALAR-NEXT:    notl %r10d
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload
-; SCALAR-NEXT:    notl %r11d
+; SCALAR-NEXT:    movl %r13d, %ecx
+; SCALAR-NEXT:    notl %ecx
+; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; SCALAR-NEXT:    notl %edi
 ; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
 ; SCALAR-NEXT:    notl %r8d
-; SCALAR-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; SCALAR-NEXT:    movw %r8w, 30(%rsi)
-; SCALAR-NEXT:    movw %r11w, 28(%rsi)
-; SCALAR-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SCALAR-NEXT:    movw %r10w, 26(%rsi)
-; SCALAR-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SCALAR-NEXT:    movw %r13w, 24(%rsi)
-; SCALAR-NEXT:    movw %r12w, 22(%rsi)
-; SCALAR-NEXT:    movw %r15w, 20(%rsi)
-; SCALAR-NEXT:    movw %r14w, 18(%rsi)
+; SCALAR-NEXT:    movw %di, 28(%rsi)
+; SCALAR-NEXT:    movw %cx, 26(%rsi)
+; SCALAR-NEXT:    movw %r12w, 24(%rsi)
+; SCALAR-NEXT:    movw %r15w, 22(%rsi)
+; SCALAR-NEXT:    movw %r14w, 20(%rsi)
+; SCALAR-NEXT:    movw %bp, 18(%rsi)
 ; SCALAR-NEXT:    movw %bx, 16(%rsi)
-; SCALAR-NEXT:    movw %r9w, 14(%rsi)
-; SCALAR-NEXT:    movw %di, 12(%rsi)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Reload
-; SCALAR-NEXT:    movw %bp, 10(%rsi)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
-; SCALAR-NEXT:    movw %di, 8(%rsi)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; SCALAR-NEXT:    movw %cx, 6(%rsi)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
-; SCALAR-NEXT:    movw %r8w, 4(%rsi)
+; SCALAR-NEXT:    movw %r11w, 14(%rsi)
+; SCALAR-NEXT:    movw %r10w, 12(%rsi)
+; SCALAR-NEXT:    movw %r9w, 10(%rsi)
+; SCALAR-NEXT:    movl %r9d, %r13d
+; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; SCALAR-NEXT:    movw %r9w, 8(%rsi)
+; SCALAR-NEXT:    movw %ax, 6(%rsi)
+; SCALAR-NEXT:    movl %eax, %r9d
+; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; SCALAR-NEXT:    movw %ax, 4(%rsi)
 ; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; SCALAR-NEXT:    movw %ax, 2(%rsi)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
-; SCALAR-NEXT:    movw %bx, (%rsi)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
-; SCALAR-NEXT:    movw %r13w, 30(%rdx)
-; SCALAR-NEXT:    movw %r11w, 28(%rdx)
-; SCALAR-NEXT:    movw %r10w, 26(%rdx)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
-; SCALAR-NEXT:    movw %si, 24(%rdx)
-; SCALAR-NEXT:    movw %r12w, 22(%rdx)
-; SCALAR-NEXT:    movw %r15w, 20(%rdx)
-; SCALAR-NEXT:    movw %r14w, 18(%rdx)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Reload
-; SCALAR-NEXT:    movw %r11w, 16(%rdx)
-; SCALAR-NEXT:    movw %r9w, 14(%rdx)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Reload
+; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; SCALAR-NEXT:    movw %ax, (%rsi)
+; SCALAR-NEXT:    movw %r8w, 30(%rdx)
+; SCALAR-NEXT:    movw %di, 28(%rdx)
+; SCALAR-NEXT:    movw %cx, 26(%rdx)
+; SCALAR-NEXT:    movw %r12w, 24(%rdx)
+; SCALAR-NEXT:    movw %r15w, 22(%rdx)
+; SCALAR-NEXT:    movw %r14w, 20(%rdx)
+; SCALAR-NEXT:    movw %bp, 18(%rdx)
+; SCALAR-NEXT:    movw %bx, 16(%rdx)
+; SCALAR-NEXT:    movw %r11w, 14(%rdx)
 ; SCALAR-NEXT:    movw %r10w, 12(%rdx)
-; SCALAR-NEXT:    movw %bp, 10(%rdx)
-; SCALAR-NEXT:    movw %di, 8(%rdx)
-; SCALAR-NEXT:    movw %cx, 6(%rdx)
-; SCALAR-NEXT:    movw %r8w, 4(%rdx)
+; SCALAR-NEXT:    movw %r13w, 10(%rdx)
+; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; SCALAR-NEXT:    movw %ax, 8(%rdx)
+; SCALAR-NEXT:    movw %r9w, 6(%rdx)
+; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; SCALAR-NEXT:    movw %ax, 4(%rdx)
+; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; SCALAR-NEXT:    movw %ax, 2(%rdx)
-; SCALAR-NEXT:    movl %ebx, %esi
+; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Reload
 ; SCALAR-NEXT:    movw %si, (%rdx)
-; SCALAR-NEXT:    movw %r13w, 62(%rdx)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
-; SCALAR-NEXT:    movw %bx, 60(%rdx)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
-; SCALAR-NEXT:    movw %bx, 58(%rdx)
-; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Reload
-; SCALAR-NEXT:    movw %bx, 56(%rdx)
-; SCALAR-NEXT:    movw %r12w, 54(%rdx)
-; SCALAR-NEXT:    movw %r15w, 52(%rdx)
-; SCALAR-NEXT:    movw %r14w, 50(%rdx)
-; SCALAR-NEXT:    movw %r11w, 48(%rdx)
-; SCALAR-NEXT:    movw %r9w, 46(%rdx)
+; SCALAR-NEXT:    movw %r8w, 62(%rdx)
+; SCALAR-NEXT:    movw %di, 60(%rdx)
+; SCALAR-NEXT:    movw %cx, 58(%rdx)
+; SCALAR-NEXT:    movw %r12w, 56(%rdx)
+; SCALAR-NEXT:    movw %r15w, 54(%rdx)
+; SCALAR-NEXT:    movw %r14w, 52(%rdx)
+; SCALAR-NEXT:    movw %bp, 50(%rdx)
+; SCALAR-NEXT:    movw %bx, 48(%rdx)
+; SCALAR-NEXT:    movw %r11w, 46(%rdx)
 ; SCALAR-NEXT:    movw %r10w, 44(%rdx)
-; SCALAR-NEXT:    movw %bp, 42(%rdx)
-; SCALAR-NEXT:    movw %di, 40(%rdx)
-; SCALAR-NEXT:    movw %cx, 38(%rdx)
-; SCALAR-NEXT:    movw %r8w, 36(%rdx)
+; SCALAR-NEXT:    movw %r13w, 42(%rdx)
+; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; SCALAR-NEXT:    movw %ax, 40(%rdx)
+; SCALAR-NEXT:    movw %r9w, 38(%rdx)
+; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; SCALAR-NEXT:    movw %ax, 36(%rdx)
+; SCALAR-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
 ; SCALAR-NEXT:    movw %ax, 34(%rdx)
 ; SCALAR-NEXT:    movw %si, 32(%rdx)
 ; SCALAR-NEXT:    popq %rbx
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index 6a52acfe2fb305..12fbbabe4c4fe8 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -110,26 +110,27 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    sbbl %ecx, %eax
-; X86-NEXT:    setb %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    setb %dl
 ; X86-NEXT:    cmpl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl %ebp, %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    sbbb $0, %dl
+; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -366,23 +367,24 @@ define i8 @ucmp_wide_op(i109 %x, i109 %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    andl %ecx, %edx
 ; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    setb %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    setb %bl
 ; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    sbbl %edi, %eax
 ; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -867,23 +869,24 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    sbbb $0, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %dl
-; X86-NEXT:    seta %bl
-; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movb %al, (%esp) # 1-byte Spill
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT:    seta %bh
-; X86-NEXT:    sbbb $0, %bh
+; X86-NEXT:    seta %al
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    seta %al
 ; X86-NEXT:    sbbb $0, %al
-; X86-NEXT:    movsbl %al, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movsbl %al, %ebx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    seta %al
@@ -915,24 +918,23 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    movl %esi, 52(%eax)
 ; X86-NEXT:    movl %ebp, 48(%eax)
 ; X86-NEXT:    movl %edi, 44(%eax)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, 40(%eax)
-; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    movsbl %bh, %ecx
-; X86-NEXT:    movl %ecx, 36(%eax)
+; X86-NEXT:    movl %ebx, 40(%eax)
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    movsbl (%esp), %edx # 1-byte Folded Reload
-; X86-NEXT:    movl %edx, 32(%eax)
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    movsbl %bl, %edi
+; X86-NEXT:    movl %edx, 36(%eax)
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    movl %esi, 32(%eax)
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    movl %edi, 28(%eax)
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
 ; X86-NEXT:    movl %ebx, 24(%eax)
 ; X86-NEXT:    movl %edi, 20(%eax)
-; X86-NEXT:    movl %edx, 16(%eax)
-; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %esi, 16(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
@@ -1519,10 +1521,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, (%rsp) # 8-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    andl $127, %eax
 ; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -1550,235 +1552,236 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    andl $127, %eax
 ; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE4-NEXT:    andl $127, %r10d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; SSE4-NEXT:    andl $127, %ebp
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    andl $127, %eax
 ; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    andl $127, %ecx
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE4-NEXT:    andl $127, %r8d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE4-NEXT:    andl $127, %ebx
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE4-NEXT:    andl $127, %edx
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r13
 ; SSE4-NEXT:    andl $127, %r13d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT:    andl $127, %r11d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE4-NEXT:    andl $127, %ebx
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; SSE4-NEXT:    andl $127, %r8d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE4-NEXT:    andl $127, %r15d
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSE4-NEXT:    andl $127, %r14d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    andl $127, %edx
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE4-NEXT:    andl $127, %r12d
+; SSE4-NEXT:    cmpq %r10, %r12
+; SSE4-NEXT:    movq %rax, %r11
+; SSE4-NEXT:    sbbq %rdx, %r11
+; SSE4-NEXT:    setb %r11b
+; SSE4-NEXT:    cmpq %r12, %r10
+; SSE4-NEXT:    sbbq %rax, %rdx
+; SSE4-NEXT:    sbbb $0, %r11b
+; SSE4-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE4-NEXT:    cmpq %rax, %rbp
-; SSE4-NEXT:    movq %r12, %r15
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    cmpq %rax, %rdx
+; SSE4-NEXT:    movq %r14, %r10
+; SSE4-NEXT:    sbbq %r15, %r10
+; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    cmpq %rdx, %rax
 ; SSE4-NEXT:    sbbq %r14, %r15
-; SSE4-NEXT:    setb %r15b
-; SSE4-NEXT:    cmpq %rbp, %rax
-; SSE4-NEXT:    sbbq %r12, %r14
-; SSE4-NEXT:    sbbb $0, %r15b
-; SSE4-NEXT:    movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT:    cmpq %rax, %r14
-; SSE4-NEXT:    movq %r11, %r15
-; SSE4-NEXT:    sbbq %r13, %r15
-; SSE4-NEXT:    setb %bpl
-; SSE4-NEXT:    cmpq %r14, %rax
-; SSE4-NEXT:    sbbq %r11, %r13
-; SSE4-NEXT:    sbbb $0, %bpl
-; SSE4-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    sbbb $0, %r10b
+; SSE4-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT:    cmpq %rax, %r11
-; SSE4-NEXT:    movq %rdx, %r14
-; SSE4-NEXT:    sbbq %rbx, %r14
-; SSE4-NEXT:    setb %bpl
-; SSE4-NEXT:    cmpq %r11, %rax
-; SSE4-NEXT:    sbbq %rdx, %rbx
-; SSE4-NEXT:    sbbb $0, %bpl
-; SSE4-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    cmpq %rax, %rdx
+; SSE4-NEXT:    movq %r8, %r10
+; SSE4-NEXT:    sbbq %rbx, %r10
+; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    cmpq %rdx, %rax
+; SSE4-NEXT:    sbbq %r8, %rbx
+; SSE4-NEXT:    sbbb $0, %r10b
+; SSE4-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; SSE4-NEXT:    cmpq %rax, %rdx
-; SSE4-NEXT:    movq %r8, %r11
-; SSE4-NEXT:    sbbq %rcx, %r11
-; SSE4-NEXT:    setb %r11b
+; SSE4-NEXT:    movq %r13, %r8
+; SSE4-NEXT:    sbbq %rcx, %r8
+; SSE4-NEXT:    setb %r8b
 ; SSE4-NEXT:    cmpq %rdx, %rax
-; SSE4-NEXT:    sbbq %r8, %rcx
-; SSE4-NEXT:    sbbb $0, %r11b
-; SSE4-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    sbbq %r13, %rcx
+; SSE4-NEXT:    sbbb $0, %r8b
+; SSE4-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; SSE4-NEXT:    movq %r8, %rdx
-; SSE4-NEXT:    sbbq %r10, %rdx
+; SSE4-NEXT:    sbbq %rbp, %rdx
 ; SSE4-NEXT:    setb %dl
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r8, %r10
+; SSE4-NEXT:    sbbq %r8, %rbp
 ; SSE4-NEXT:    sbbb $0, %dl
 ; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    movq %r10, %rdx
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; SSE4-NEXT:    sbbq %r8, %rdx
-; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    setb %dl
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
-; SSE4-NEXT:    sbbb $0, %r10b
+; SSE4-NEXT:    sbbq %r10, %r8
+; SSE4-NEXT:    sbbb $0, %dl
+; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    movq %r10, %rdx
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; SSE4-NEXT:    sbbq %r8, %rdx
 ; SSE4-NEXT:    setb %dl
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
+; SSE4-NEXT:    sbbq %r10, %r8
 ; SSE4-NEXT:    sbbb $0, %dl
 ; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    movq %r10, %rdx
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; SSE4-NEXT:    sbbq %r8, %rdx
-; SSE4-NEXT:    setb %bpl
+; SSE4-NEXT:    setb %r11b
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
-; SSE4-NEXT:    sbbb $0, %bpl
+; SSE4-NEXT:    sbbq %r10, %r8
+; SSE4-NEXT:    sbbb $0, %r11b
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    movq %r10, %rdx
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
 ; SSE4-NEXT:    sbbq %r8, %rdx
 ; SSE4-NEXT:    setb %dl
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
+; SSE4-NEXT:    sbbq %r10, %r8
 ; SSE4-NEXT:    sbbb $0, %dl
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE4-NEXT:    movq %r14, %r8
-; SSE4-NEXT:    movq (%rsp), %rbx # 8-byte Reload
-; SSE4-NEXT:    sbbq %rbx, %r8
-; SSE4-NEXT:    setb %r11b
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE4-NEXT:    movq %rbx, %r8
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r10, %r8
+; SSE4-NEXT:    setb %r8b
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r14, %rbx
-; SSE4-NEXT:    sbbb $0, %r11b
+; SSE4-NEXT:    sbbq %rbx, %r10
+; SSE4-NEXT:    sbbb $0, %r8b
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE4-NEXT:    movq %r14, %rbx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rbx
-; SSE4-NEXT:    setb %bl
+; SSE4-NEXT:    movq (%rsp), %r14 # 8-byte Reload
+; SSE4-NEXT:    movq %r14, %r10
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE4-NEXT:    sbbq %rbx, %r10
+; SSE4-NEXT:    setb %r10b
 ; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r14, %r8
-; SSE4-NEXT:    sbbb $0, %bl
+; SSE4-NEXT:    sbbq %r14, %rbx
+; SSE4-NEXT:    sbbb $0, %r10b
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT:    cmpq %rax, %r14
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE4-NEXT:    cmpq %rax, %rbx
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; SSE4-NEXT:    movq %r15, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rcx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r14, %rcx
 ; SSE4-NEXT:    setb %cl
-; SSE4-NEXT:    cmpq %r14, %rax
-; SSE4-NEXT:    sbbq %r15, %r8
+; SSE4-NEXT:    cmpq %rbx, %rax
+; SSE4-NEXT:    sbbq %r15, %r14
 ; SSE4-NEXT:    sbbb $0, %cl
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE4-NEXT:    cmpq %rax, %r15
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE4-NEXT:    cmpq %rax, %r14
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT:    movq %r12, %r14
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r14
-; SSE4-NEXT:    setb %r14b
-; SSE4-NEXT:    cmpq %r15, %rax
-; SSE4-NEXT:    sbbq %r12, %r8
-; SSE4-NEXT:    sbbb $0, %r14b
+; SSE4-NEXT:    movq %r12, %rbx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r15, %rbx
+; SSE4-NEXT:    setb %bl
+; SSE4-NEXT:    cmpq %r14, %rax
+; SSE4-NEXT:    sbbq %r12, %r15
+; SSE4-NEXT:    sbbb $0, %bl
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE4-NEXT:    cmpq %r9, %rax
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT:    movq %r12, %r15
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r15
-; SSE4-NEXT:    setb %r15b
+; SSE4-NEXT:    movq %r12, %r14
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r15, %r14
+; SSE4-NEXT:    setb %bpl
 ; SSE4-NEXT:    cmpq %rax, %r9
-; SSE4-NEXT:    sbbq %r12, %r8
-; SSE4-NEXT:    sbbb $0, %r15b
+; SSE4-NEXT:    sbbq %r12, %r15
+; SSE4-NEXT:    sbbb $0, %bpl
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    cmpq %r15, %rax
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT:    cmpq %r12, %rax
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE4-NEXT:    movq %r13, %r9
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r9
+; SSE4-NEXT:    movq %r12, %r9
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r14, %r9
 ; SSE4-NEXT:    setb %r9b
-; SSE4-NEXT:    cmpq %rax, %r12
-; SSE4-NEXT:    sbbq %r13, %r8
-; SSE4-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE4-NEXT:    cmpq %rax, %r15
+; SSE4-NEXT:    sbbq %r12, %r14
+; SSE4-NEXT:    movq %rdi, %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSE4-NEXT:    sbbb $0, %r9b
-; SSE4-NEXT:    cmpq %rsi, %r12
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    movq %r8, %rdi
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT:    sbbq %rax, %rdi
+; SSE4-NEXT:    cmpq %rsi, %r14
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE4-NEXT:    movq %r12, %rdi
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r15, %rdi
 ; SSE4-NEXT:    setb %dil
-; SSE4-NEXT:    cmpq %r12, %rsi
-; SSE4-NEXT:    sbbq %r8, %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; SSE4-NEXT:    cmpq %r14, %rsi
+; SSE4-NEXT:    sbbq %r12, %r15
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; SSE4-NEXT:    sbbb $0, %dil
-; SSE4-NEXT:    cmpq %r12, %r13
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    movq %r8, %rsi
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT:    sbbq %rax, %rsi
+; SSE4-NEXT:    cmpq %r14, %r15
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; SSE4-NEXT:    movq %r13, %rsi
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r12, %rsi
 ; SSE4-NEXT:    setb %sil
-; SSE4-NEXT:    cmpq %r13, %r12
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm1
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm2
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm3
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm4
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm5
-; SSE4-NEXT:    movzbl %r10b, %r10d
-; SSE4-NEXT:    movd %r10d, %xmm6
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r10d, %xmm7
-; SSE4-NEXT:    movzbl %bpl, %r10d
-; SSE4-NEXT:    movd %r10d, %xmm0
+; SSE4-NEXT:    cmpq %r15, %r14
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
+; SSE4-NEXT:    movd %r14d, %xmm1
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
+; SSE4-NEXT:    movd %r14d, %xmm2
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
+; SSE4-NEXT:    movd %r14d, %xmm3
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
+; SSE4-NEXT:    movd %r14d, %xmm4
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
+; SSE4-NEXT:    movd %r14d, %xmm5
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
+; SSE4-NEXT:    movd %r14d, %xmm6
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
+; SSE4-NEXT:    movd %r14d, %xmm7
+; SSE4-NEXT:    movzbl %r11b, %r11d
+; SSE4-NEXT:    movd %r11d, %xmm0
 ; SSE4-NEXT:    movzbl %dl, %edx
 ; SSE4-NEXT:    movd %edx, %xmm8
-; SSE4-NEXT:    movzbl %r11b, %edx
+; SSE4-NEXT:    movzbl %r8b, %edx
 ; SSE4-NEXT:    movd %edx, %xmm9
-; SSE4-NEXT:    movzbl %bl, %edx
+; SSE4-NEXT:    movzbl %r10b, %edx
 ; SSE4-NEXT:    movd %edx, %xmm10
 ; SSE4-NEXT:    movzbl %cl, %ecx
 ; SSE4-NEXT:    movd %ecx, %xmm11
-; SSE4-NEXT:    movzbl %r14b, %ecx
+; SSE4-NEXT:    movzbl %bl, %ecx
 ; SSE4-NEXT:    movd %ecx, %xmm12
-; SSE4-NEXT:    movzbl %r15b, %ecx
+; SSE4-NEXT:    movzbl %bpl, %ecx
 ; SSE4-NEXT:    movd %ecx, %xmm13
 ; SSE4-NEXT:    movzbl %r9b, %ecx
 ; SSE4-NEXT:    movd %ecx, %xmm14
@@ -1798,12 +1801,11 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE4-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
 ; SSE4-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
 ; SSE4-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
-; SSE4-NEXT:    sbbq %r8, %rax
+; SSE4-NEXT:    sbbq %r13, %r12
 ; SSE4-NEXT:    sbbb $0, %sil
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0]
 ; SSE4-NEXT:    movzbl %sil, %ecx
 ; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
 ; SSE4-NEXT:    movb %cl, 4(%rax)
 ; SSE4-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp)
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
@@ -1957,81 +1959,81 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    andl $127, %eax
 ; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; SSE2-NEXT:    andl $127, %ebp
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE2-NEXT:    andl $127, %ecx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; SSE2-NEXT:    andl $127, %r13d
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT:    andl $127, %ebx
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; SSE2-NEXT:    andl $127, %edx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT:    andl $127, %r10d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT:    andl $127, %r14d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT:    andl $127, %ebp
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE2-NEXT:    andl $127, %r13d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE2-NEXT:    andl $127, %r11d
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; SSE2-NEXT:    andl $127, %r15d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE2-NEXT:    cmpq %rax, %r12
-; SSE2-NEXT:    movq %r15, %r8
-; SSE2-NEXT:    sbbq %r11, %r8
-; SSE2-NEXT:    setb %r8b
-; SSE2-NEXT:    cmpq %r12, %rax
-; SSE2-NEXT:    sbbq %r15, %r11
-; SSE2-NEXT:    sbbb $0, %r8b
-; SSE2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE2-NEXT:    andl $127, %r14d
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    cmpq %rax, %r8
-; SSE2-NEXT:    movq %r13, %r11
-; SSE2-NEXT:    sbbq %rbp, %r11
+; SSE2-NEXT:    andl $127, %r8d
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    andl $127, %r10d
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE2-NEXT:    cmpq %rbx, %r12
+; SSE2-NEXT:    movq %r10, %r11
+; SSE2-NEXT:    sbbq %r8, %r11
 ; SSE2-NEXT:    setb %r11b
-; SSE2-NEXT:    cmpq %r8, %rax
-; SSE2-NEXT:    sbbq %r13, %rbp
+; SSE2-NEXT:    cmpq %r12, %rbx
+; SSE2-NEXT:    sbbq %r10, %r8
 ; SSE2-NEXT:    sbbb $0, %r11b
 ; SSE2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    cmpq %rax, %r8
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    cmpq %r8, %r10
 ; SSE2-NEXT:    movq %r14, %r11
-; SSE2-NEXT:    sbbq %r10, %r11
+; SSE2-NEXT:    sbbq %r15, %r11
 ; SSE2-NEXT:    setb %r11b
-; SSE2-NEXT:    cmpq %r8, %rax
-; SSE2-NEXT:    sbbq %r14, %r10
+; SSE2-NEXT:    cmpq %r10, %r8
+; SSE2-NEXT:    sbbq %r14, %r15
 ; SSE2-NEXT:    sbbb $0, %r11b
 ; SSE2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    cmpq %rax, %r8
-; SSE2-NEXT:    movq %rdx, %r10
-; SSE2-NEXT:    sbbq %rbx, %r10
-; SSE2-NEXT:    setb %r10b
-; SSE2-NEXT:    cmpq %r8, %rax
-; SSE2-NEXT:    sbbq %rdx, %rbx
-; SSE2-NEXT:    sbbb $0, %r10b
-; SSE2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    cmpq %r8, %r10
+; SSE2-NEXT:    movq %rdx, %r11
+; SSE2-NEXT:    sbbq %rax, %r11
+; SSE2-NEXT:    setb %r11b
+; SSE2-NEXT:    cmpq %r10, %r8
+; SSE2-NEXT:    sbbq %rdx, %rax
+; SSE2-NEXT:    sbbb $0, %r11b
+; SSE2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; SSE2-NEXT:    cmpq %rax, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    movq %r10, %r8
+; SSE2-NEXT:    movq %r13, %r8
 ; SSE2-NEXT:    sbbq %rcx, %r8
 ; SSE2-NEXT:    setb %r8b
 ; SSE2-NEXT:    cmpq %rdx, %rax
-; SSE2-NEXT:    sbbq %r10, %rcx
+; SSE2-NEXT:    sbbq %r13, %rcx
 ; SSE2-NEXT:    sbbb $0, %r8b
 ; SSE2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE2-NEXT:    cmpq %rax, %rcx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT:    movq %r8, %rdx
+; SSE2-NEXT:    sbbq %rbp, %rdx
+; SSE2-NEXT:    setb %dl
+; SSE2-NEXT:    cmpq %rcx, %rax
+; SSE2-NEXT:    sbbq %r8, %rbp
+; SSE2-NEXT:    sbbb $0, %dl
+; SSE2-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    cmpq %rax, %rcx
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; SSE2-NEXT:    movq %r10, %rdx
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
@@ -2056,79 +2058,80 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE2-NEXT:    movq %r11, %rdx
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r10, %rdx
-; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    movq %r10, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r8, %rdx
+; SSE2-NEXT:    setb %dl
 ; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r11, %r10
-; SSE2-NEXT:    sbbb $0, %r8b
+; SSE2-NEXT:    sbbq %r10, %r8
+; SSE2-NEXT:    sbbb $0, %dl
+; SSE2-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; SSE2-NEXT:    movq %rbx, %rdx
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r10, %rdx
-; SSE2-NEXT:    setb %r11b
+; SSE2-NEXT:    movq %r10, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r8, %rdx
+; SSE2-NEXT:    setb %dl
 ; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %rbx, %r10
-; SSE2-NEXT:    sbbb $0, %r11b
+; SSE2-NEXT:    sbbq %r10, %r8
+; SSE2-NEXT:    sbbb $0, %dl
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; SSE2-NEXT:    movq %rbx, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    movq %r11, %r8
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r10, %rdx
-; SSE2-NEXT:    setb %dl
+; SSE2-NEXT:    sbbq %r10, %r8
+; SSE2-NEXT:    setb %r8b
 ; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %rbx, %r10
-; SSE2-NEXT:    sbbb $0, %dl
+; SSE2-NEXT:    sbbq %r11, %r10
+; SSE2-NEXT:    sbbb $0, %r8b
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT:    movq %r14, %r10
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; SSE2-NEXT:    sbbq %rbx, %r10
+; SSE2-NEXT:    movq %rbx, %r10
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r11, %r10
 ; SSE2-NEXT:    setb %r10b
 ; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r14, %rbx
+; SSE2-NEXT:    sbbq %rbx, %r11
 ; SSE2-NEXT:    sbbb $0, %r10b
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT:    cmpq %rax, %r11
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE2-NEXT:    movq %r14, %rcx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT:    sbbq %rbx, %rcx
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    cmpq %r11, %rax
+; SSE2-NEXT:    sbbq %r14, %rbx
+; SSE2-NEXT:    sbbb $0, %cl
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
 ; SSE2-NEXT:    cmpq %rax, %rbx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    movq %r15, %rcx
+; SSE2-NEXT:    movq (%rsp), %r15 # 8-byte Reload
+; SSE2-NEXT:    movq %r15, %r11
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r14, %rcx
-; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    sbbq %r14, %r11
+; SSE2-NEXT:    setb %r11b
 ; SSE2-NEXT:    cmpq %rbx, %rax
 ; SSE2-NEXT:    sbbq %r15, %r14
-; SSE2-NEXT:    sbbb $0, %cl
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT:    cmpq %rax, %r14
-; SSE2-NEXT:    movq (%rsp), %r12 # 8-byte Reload
-; SSE2-NEXT:    movq %r12, %rbx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r15, %rbx
-; SSE2-NEXT:    setb %bl
-; SSE2-NEXT:    cmpq %r14, %rax
-; SSE2-NEXT:    sbbq %r12, %r15
-; SSE2-NEXT:    sbbb $0, %bl
+; SSE2-NEXT:    sbbb $0, %r11b
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    cmpq %r9, %rax
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT:    movq %r12, %r14
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r15, %r14
-; SSE2-NEXT:    setb %bpl
+; SSE2-NEXT:    movq %r15, %rbx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r14, %rbx
+; SSE2-NEXT:    setb %bl
 ; SSE2-NEXT:    cmpq %rax, %r9
-; SSE2-NEXT:    sbbq %r12, %r15
-; SSE2-NEXT:    sbbb $0, %bpl
+; SSE2-NEXT:    sbbq %r15, %r14
+; SSE2-NEXT:    sbbb $0, %bl
 ; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    cmpq %rsi, %rax
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
@@ -2158,11 +2161,11 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE2-NEXT:    movq %r13, %r15
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
 ; SSE2-NEXT:    sbbq %r12, %r15
-; SSE2-NEXT:    setb %r15b
+; SSE2-NEXT:    setb %bpl
 ; SSE2-NEXT:    cmpq %r14, %rsi
 ; SSE2-NEXT:    sbbq %r13, %r12
-; SSE2-NEXT:    sbbb $0, %r15b
-; SSE2-NEXT:    movzbl %r15b, %esi
+; SSE2-NEXT:    sbbb $0, %bpl
+; SSE2-NEXT:    movzbl %bpl, %esi
 ; SSE2-NEXT:    andl $3, %esi
 ; SSE2-NEXT:    movb %sil, 4(%rax)
 ; SSE2-NEXT:    movzbl %dil, %esi
@@ -2170,11 +2173,11 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE2-NEXT:    andl $3, %esi
 ; SSE2-NEXT:    andl $3, %edi
 ; SSE2-NEXT:    leaq (%rdi,%rsi,4), %rsi
-; SSE2-NEXT:    movzbl %bpl, %edi
+; SSE2-NEXT:    movzbl %bl, %edi
 ; SSE2-NEXT:    andl $3, %edi
 ; SSE2-NEXT:    shll $4, %edi
 ; SSE2-NEXT:    orq %rsi, %rdi
-; SSE2-NEXT:    movzbl %bl, %r9d
+; SSE2-NEXT:    movzbl %r11b, %r9d
 ; SSE2-NEXT:    andl $3, %r9d
 ; SSE2-NEXT:    shll $6, %r9d
 ; SSE2-NEXT:    orq %rdi, %r9
@@ -2182,18 +2185,18 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE2-NEXT:    andl $3, %esi
 ; SSE2-NEXT:    shll $8, %esi
 ; SSE2-NEXT:    orq %r9, %rsi
-; SSE2-NEXT:    movzbl %dl, %ecx
-; SSE2-NEXT:    movzbl %r10b, %edx
-; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shll $10, %edx
+; SSE2-NEXT:    movzbl %r8b, %ecx
+; SSE2-NEXT:    movzbl %r10b, %edi
+; SSE2-NEXT:    andl $3, %edi
+; SSE2-NEXT:    shll $10, %edi
 ; SSE2-NEXT:    andl $3, %ecx
 ; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    movzbl %r11b, %edx
+; SSE2-NEXT:    orq %rdi, %rcx
+; SSE2-NEXT:    movzbl %dl, %edx
 ; SSE2-NEXT:    andl $3, %edx
 ; SSE2-NEXT:    shll $14, %edx
 ; SSE2-NEXT:    orq %rcx, %rdx
-; SSE2-NEXT:    movzbl %r8b, %ecx
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; SSE2-NEXT:    andl $3, %ecx
 ; SSE2-NEXT:    shll $16, %ecx
 ; SSE2-NEXT:    orq %rdx, %rcx
@@ -2321,62 +2324,62 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    andl $127, %eax
 ; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT:    andl $127, %r15d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX2-NEXT:    andl $127, %r14d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    andl $127, %edx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; AVX2-NEXT:    andl $127, %ebp
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; AVX2-NEXT:    andl $127, %r8d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT:    andl $127, %r12d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    andl $127, %r10d
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
 ; AVX2-NEXT:    andl $127, %r13d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT:    andl $127, %r12d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    andl $127, %edx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    andl $127, %r14d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT:    andl $127, %r15d
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX2-NEXT:    cmpq %rbx, %r11
-; AVX2-NEXT:    movq %r13, %r10
-; AVX2-NEXT:    sbbq %r12, %r10
-; AVX2-NEXT:    setb %r10b
+; AVX2-NEXT:    movq %r15, %rbp
+; AVX2-NEXT:    sbbq %r14, %rbp
+; AVX2-NEXT:    setb %bpl
 ; AVX2-NEXT:    cmpq %r11, %rbx
-; AVX2-NEXT:    sbbq %r13, %r12
-; AVX2-NEXT:    sbbb $0, %r10b
-; AVX2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    sbbq %r15, %r14
+; AVX2-NEXT:    sbbb $0, %bpl
+; AVX2-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    cmpq %r10, %r11
-; AVX2-NEXT:    movq %r8, %rbx
-; AVX2-NEXT:    sbbq %rbp, %rbx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT:    cmpq %r11, %rbx
+; AVX2-NEXT:    movq %rdx, %r14
+; AVX2-NEXT:    sbbq %r12, %r14
+; AVX2-NEXT:    setb %bpl
+; AVX2-NEXT:    cmpq %rbx, %r11
+; AVX2-NEXT:    sbbq %rdx, %r12
+; AVX2-NEXT:    sbbb $0, %bpl
+; AVX2-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    cmpq %rdx, %r11
+; AVX2-NEXT:    movq %r13, %rbx
+; AVX2-NEXT:    sbbq %r10, %rbx
 ; AVX2-NEXT:    setb %bl
-; AVX2-NEXT:    cmpq %r11, %r10
-; AVX2-NEXT:    sbbq %r8, %rbp
+; AVX2-NEXT:    cmpq %r11, %rdx
+; AVX2-NEXT:    sbbq %r13, %r10
 ; AVX2-NEXT:    sbbb $0, %bl
 ; AVX2-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    cmpq %r8, %r10
-; AVX2-NEXT:    movq %rdx, %r11
-; AVX2-NEXT:    sbbq %r14, %r11
+; AVX2-NEXT:    cmpq %rdx, %r10
+; AVX2-NEXT:    movq %rax, %r11
+; AVX2-NEXT:    sbbq %r8, %r11
 ; AVX2-NEXT:    setb %r11b
-; AVX2-NEXT:    cmpq %r10, %r8
-; AVX2-NEXT:    sbbq %rdx, %r14
+; AVX2-NEXT:    cmpq %r10, %rdx
+; AVX2-NEXT:    sbbq %rax, %r8
 ; AVX2-NEXT:    sbbb $0, %r11b
 ; AVX2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    cmpq %rdx, %r8
-; AVX2-NEXT:    movq %rax, %r10
-; AVX2-NEXT:    sbbq %r15, %r10
-; AVX2-NEXT:    setb %r10b
-; AVX2-NEXT:    cmpq %r8, %rdx
-; AVX2-NEXT:    sbbq %rax, %r15
-; AVX2-NEXT:    sbbb $0, %r10b
-; AVX2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; AVX2-NEXT:    cmpq %rax, %rdx
@@ -2420,32 +2423,32 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX2-NEXT:    movq %r11, %r8
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; AVX2-NEXT:    sbbq %r10, %r8
-; AVX2-NEXT:    setb %r12b
+; AVX2-NEXT:    setb %r13b
 ; AVX2-NEXT:    cmpq %rdx, %rax
 ; AVX2-NEXT:    sbbq %r11, %r10
-; AVX2-NEXT:    sbbb $0, %r12b
+; AVX2-NEXT:    sbbb $0, %r13b
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    cmpq %rax, %rdx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT:    cmpq %rax, %r8
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
+; AVX2-NEXT:    movq %r11, %rdx
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
-; AVX2-NEXT:    setb %r8b
-; AVX2-NEXT:    cmpq %rdx, %rax
+; AVX2-NEXT:    sbbq %r10, %rdx
+; AVX2-NEXT:    setb %dl
+; AVX2-NEXT:    cmpq %r8, %rax
 ; AVX2-NEXT:    sbbq %r11, %r10
-; AVX2-NEXT:    sbbb $0, %r8b
+; AVX2-NEXT:    sbbb $0, %dl
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX2-NEXT:    cmpq %rax, %r10
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT:    movq %rbx, %rdx
+; AVX2-NEXT:    movq %rbx, %r8
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r11, %rdx
-; AVX2-NEXT:    setb %dl
+; AVX2-NEXT:    sbbq %r11, %r8
+; AVX2-NEXT:    setb %r8b
 ; AVX2-NEXT:    cmpq %r10, %rax
 ; AVX2-NEXT:    sbbq %rbx, %r11
-; AVX2-NEXT:    sbbb $0, %dl
+; AVX2-NEXT:    sbbb $0, %r8b
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX2-NEXT:    cmpq %rax, %r11
@@ -2471,23 +2474,23 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; AVX2-NEXT:    cmpq %rax, %r14
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %rbx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT:    movq %r12, %rbx
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; AVX2-NEXT:    sbbq %r15, %rbx
 ; AVX2-NEXT:    setb %bl
 ; AVX2-NEXT:    cmpq %r14, %rax
-; AVX2-NEXT:    sbbq %r13, %r15
+; AVX2-NEXT:    sbbq %r12, %r15
 ; AVX2-NEXT:    sbbb $0, %bl
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    cmpq %r9, %rax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %r14
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT:    movq %r12, %r14
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; AVX2-NEXT:    sbbq %r15, %r14
 ; AVX2-NEXT:    setb %bpl
 ; AVX2-NEXT:    cmpq %rax, %r9
-; AVX2-NEXT:    sbbq %r13, %r15
+; AVX2-NEXT:    sbbq %r12, %r15
 ; AVX2-NEXT:    sbbb $0, %bpl
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    cmpq %rsi, %rax
@@ -2512,13 +2515,13 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX2-NEXT:    sbbb $0, %sil
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX2-NEXT:    cmpq %rax, %rcx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %r14
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT:    movq %r12, %r14
 ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
 ; AVX2-NEXT:    sbbq %r15, %r14
 ; AVX2-NEXT:    setb %r14b
 ; AVX2-NEXT:    cmpq %rcx, %rax
-; AVX2-NEXT:    sbbq %r13, %r15
+; AVX2-NEXT:    sbbq %r12, %r15
 ; AVX2-NEXT:    movq %rdi, %rax
 ; AVX2-NEXT:    sbbb $0, %r14b
 ; AVX2-NEXT:    movzbl %r14b, %ecx
@@ -2544,18 +2547,18 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX2-NEXT:    movzbl %r10b, %ecx
 ; AVX2-NEXT:    andl $3, %ecx
 ; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    movzbl %dl, %edx
-; AVX2-NEXT:    andl $3, %edx
-; AVX2-NEXT:    shll $12, %edx
-; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    movzbl %r8b, %edi
 ; AVX2-NEXT:    andl $3, %edi
-; AVX2-NEXT:    shll $14, %edi
-; AVX2-NEXT:    orq %rdx, %rdi
-; AVX2-NEXT:    movzbl %r12b, %ecx
+; AVX2-NEXT:    shll $12, %edi
+; AVX2-NEXT:    orq %rcx, %rdi
+; AVX2-NEXT:    movzbl %dl, %edx
+; AVX2-NEXT:    andl $3, %edx
+; AVX2-NEXT:    shll $14, %edx
+; AVX2-NEXT:    orq %rdi, %rdx
+; AVX2-NEXT:    movzbl %r13b, %ecx
 ; AVX2-NEXT:    andl $3, %ecx
 ; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orq %rdi, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
 ; AVX2-NEXT:    orq %rsi, %rcx
 ; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
 ; AVX2-NEXT:    andl $3, %edx
@@ -2672,14 +2675,14 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    andl $127, %eax
 ; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; AVX512-NEXT:    andl $127, %ebp
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX512-NEXT:    andl $127, %r12d
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r13
 ; AVX512-NEXT:    andl $127, %r13d
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX512-NEXT:    andl $127, %ebp
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r15
 ; AVX512-NEXT:    andl $127, %r15d
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX512-NEXT:    andl $127, %r12d
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT:    andl $127, %r10d
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
@@ -2739,21 +2742,21 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %r15, %rdx
-; AVX512-NEXT:    sbbq %r13, %rdx
+; AVX512-NEXT:    movq %r12, %rdx
+; AVX512-NEXT:    sbbq %r15, %rdx
 ; AVX512-NEXT:    setb %dl
 ; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %r15, %r13
+; AVX512-NEXT:    sbbq %r12, %r15
 ; AVX512-NEXT:    sbbb $0, %dl
 ; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %r12, %rdx
-; AVX512-NEXT:    sbbq %rbp, %rdx
+; AVX512-NEXT:    movq %rbp, %rdx
+; AVX512-NEXT:    sbbq %r13, %rdx
 ; AVX512-NEXT:    setb %dl
 ; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %r12, %rbp
+; AVX512-NEXT:    sbbq %rbp, %r13
 ; AVX512-NEXT:    sbbb $0, %dl
 ; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
@@ -2763,10 +2766,11 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    movq %rdi, %rdx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX512-NEXT:    sbbq %rsi, %rdx
-; AVX512-NEXT:    setb %r13b
+; AVX512-NEXT:    setb %dl
 ; AVX512-NEXT:    cmpq %rcx, %rax
 ; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %r13b
+; AVX512-NEXT:    sbbb $0, %dl
+; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    cmpq %rax, %rcx
@@ -2774,10 +2778,10 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    movq %rdi, %rdx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX512-NEXT:    sbbq %rsi, %rdx
-; AVX512-NEXT:    setb %bpl
+; AVX512-NEXT:    setb %r13b
 ; AVX512-NEXT:    cmpq %rcx, %rax
 ; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %bpl
+; AVX512-NEXT:    sbbb $0, %r13b
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; AVX512-NEXT:    cmpq %rcx, %rdx
@@ -2785,111 +2789,111 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    movq %rdi, %rax
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
 ; AVX512-NEXT:    sbbq %rsi, %rax
-; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    setb %al
 ; AVX512-NEXT:    cmpq %rdx, %rcx
 ; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %r9b
+; AVX512-NEXT:    sbbb $0, %al
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT:    movq %r8, %rcx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT:    movq %rdi, %rcx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %rcx
+; AVX512-NEXT:    sbbq %rdi, %rcx
 ; AVX512-NEXT:    setb %cl
 ; AVX512-NEXT:    cmpq %rsi, %rdx
-; AVX512-NEXT:    sbbq %rdi, %rax
+; AVX512-NEXT:    sbbq %r8, %rdi
 ; AVX512-NEXT:    sbbb $0, %cl
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512-NEXT:    cmpq %rsi, %rdi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT:    movq %r9, %rdx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT:    movq %r8, %rdx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %rdx
+; AVX512-NEXT:    sbbq %r8, %rdx
 ; AVX512-NEXT:    setb %dl
 ; AVX512-NEXT:    cmpq %rdi, %rsi
-; AVX512-NEXT:    sbbq %r8, %rax
+; AVX512-NEXT:    sbbq %r9, %r8
 ; AVX512-NEXT:    sbbb $0, %dl
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; AVX512-NEXT:    cmpq %rdi, %r8
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
 ; AVX512-NEXT:    movq %r10, %rsi
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %rsi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT:    sbbq %r9, %rsi
 ; AVX512-NEXT:    setb %sil
 ; AVX512-NEXT:    cmpq %r8, %rdi
-; AVX512-NEXT:    sbbq %r10, %rax
+; AVX512-NEXT:    sbbq %r10, %r9
 ; AVX512-NEXT:    sbbb $0, %sil
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    cmpq %r8, %r10
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; AVX512-NEXT:    cmpq %r8, %r9
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
 ; AVX512-NEXT:    movq %r11, %rdi
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %rdi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT:    sbbq %r10, %rdi
 ; AVX512-NEXT:    setb %dil
-; AVX512-NEXT:    cmpq %r10, %r8
-; AVX512-NEXT:    sbbq %r11, %rax
+; AVX512-NEXT:    cmpq %r9, %r8
+; AVX512-NEXT:    sbbq %r11, %r10
 ; AVX512-NEXT:    sbbb $0, %dil
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    cmpq %rax, %r10
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT:    cmpq %r10, %r9
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
 ; AVX512-NEXT:    movq %rbx, %r8
 ; AVX512-NEXT:    movq (%rsp), %r11 # 8-byte Reload
 ; AVX512-NEXT:    sbbq %r11, %r8
 ; AVX512-NEXT:    setb %r8b
-; AVX512-NEXT:    cmpq %r10, %rax
+; AVX512-NEXT:    cmpq %r9, %r10
 ; AVX512-NEXT:    sbbq %rbx, %r11
 ; AVX512-NEXT:    sbbb $0, %r8b
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT:    cmpq %rbx, %r11
+; AVX512-NEXT:    cmpq %rbx, %r10
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT:    movq %r14, %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %r10
-; AVX512-NEXT:    setb %r10b
-; AVX512-NEXT:    cmpq %r11, %rbx
-; AVX512-NEXT:    sbbq %r14, %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    sbbb $0, %r10b
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT:    cmpq %r15, %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    movq %rax, %rbx
+; AVX512-NEXT:    movq %r14, %r9
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT:    sbbq %r11, %r9
+; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    cmpq %r10, %rbx
+; AVX512-NEXT:    sbbq %r14, %r11
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    sbbb $0, %r9b
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT:    sbbq %r14, %rbx
-; AVX512-NEXT:    setb %bl
-; AVX512-NEXT:    cmpq %r11, %r15
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    sbbq %rax, %r14
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512-NEXT:    sbbb $0, %bl
-; AVX512-NEXT:    cmpq %r11, %r14
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    movq %rax, %r15
+; AVX512-NEXT:    cmpq %r14, %r10
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT:    movq %r15, %r11
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT:    sbbq %rbx, %r11
+; AVX512-NEXT:    setb %r11b
+; AVX512-NEXT:    cmpq %r10, %r14
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    sbbq %r15, %rbx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX512-NEXT:    sbbb $0, %r11b
+; AVX512-NEXT:    cmpq %r10, %rbx
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX512-NEXT:    movq %r12, %r14
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT:    sbbq %r15, %r14
+; AVX512-NEXT:    setb %bpl
+; AVX512-NEXT:    cmpq %rbx, %r10
 ; AVX512-NEXT:    sbbq %r12, %r15
-; AVX512-NEXT:    setb %r15b
-; AVX512-NEXT:    cmpq %r14, %r11
-; AVX512-NEXT:    sbbq %rax, %r12
-; AVX512-NEXT:    sbbb $0, %r15b
-; AVX512-NEXT:    movzbl %r15b, %r11d
-; AVX512-NEXT:    andl $3, %r11d
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT:    movb %r11b, 4(%r14)
-; AVX512-NEXT:    movzbl %bl, %r11d
-; AVX512-NEXT:    andl $3, %r11d
-; AVX512-NEXT:    movzbl %r10b, %r10d
+; AVX512-NEXT:    sbbb $0, %bpl
+; AVX512-NEXT:    movzbl %bpl, %r10d
 ; AVX512-NEXT:    andl $3, %r10d
-; AVX512-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT:    movb %r10b, 4(%rbx)
+; AVX512-NEXT:    movzbl %r11b, %r10d
+; AVX512-NEXT:    andl $3, %r10d
+; AVX512-NEXT:    movzbl %r9b, %r9d
+; AVX512-NEXT:    andl $3, %r9d
+; AVX512-NEXT:    leaq (%r9,%r10,4), %r9
 ; AVX512-NEXT:    movzbl %r8b, %r8d
 ; AVX512-NEXT:    andl $3, %r8d
 ; AVX512-NEXT:    shll $4, %r8d
-; AVX512-NEXT:    orq %r10, %r8
+; AVX512-NEXT:    orq %r9, %r8
 ; AVX512-NEXT:    movzbl %dil, %edi
 ; AVX512-NEXT:    andl $3, %edi
 ; AVX512-NEXT:    shll $6, %edi
@@ -2905,16 +2909,16 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    andl $3, %ecx
 ; AVX512-NEXT:    shll $12, %ecx
 ; AVX512-NEXT:    orq %rdx, %rcx
-; AVX512-NEXT:    movzbl %r9b, %edx
+; AVX512-NEXT:    movzbl %al, %edx
 ; AVX512-NEXT:    andl $3, %edx
 ; AVX512-NEXT:    shll $14, %edx
 ; AVX512-NEXT:    orq %rcx, %rdx
-; AVX512-NEXT:    movzbl %bpl, %eax
+; AVX512-NEXT:    movzbl %r13b, %eax
 ; AVX512-NEXT:    andl $3, %eax
 ; AVX512-NEXT:    shll $16, %eax
 ; AVX512-NEXT:    orq %rdx, %rax
 ; AVX512-NEXT:    orq %rsi, %rax
-; AVX512-NEXT:    movzbl %r13b, %ecx
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; AVX512-NEXT:    andl $3, %ecx
 ; AVX512-NEXT:    shll $18, %ecx
 ; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
@@ -2942,8 +2946,8 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    shlq $30, %rdx
 ; AVX512-NEXT:    orq %rax, %rdx
 ; AVX512-NEXT:    orq %rcx, %rdx
-; AVX512-NEXT:    movq %r14, %rax
-; AVX512-NEXT:    movl %edx, (%r14)
+; AVX512-NEXT:    movq %rbx, %rax
+; AVX512-NEXT:    movl %edx, (%rbx)
 ; AVX512-NEXT:    addq $88, %rsp
 ; AVX512-NEXT:    popq %rbx
 ; AVX512-NEXT:    popq %r12
@@ -3341,70 +3345,73 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    sbbl %edx, %eax
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    setb %al
+; X86-NEXT:    cmpl %esi, %ecx
 ; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %ebx, %edx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    sbbb $0, %bl
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %ecx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    sbbl %edx, %ebp
 ; X86-NEXT:    movl $0, %ebp
 ; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    setb %bh
-; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    sbbl %eax, %edx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    sbbb $0, %bh
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %dh # 1-byte Reload
+; X86-NEXT:    sbbb $0, %dh
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    sbbl %eax, %ebp
 ; X86-NEXT:    movl $0, %ebp
 ; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    setb %dl
+; X86-NEXT:    cmpl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    andl $3, %ecx
+; X86-NEXT:    sbbb $0, %dl
+; X86-NEXT:    movzbl %dl, %eax
+; X86-NEXT:    andl $3, %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movb %cl, 4(%edi)
-; X86-NEXT:    movzbl %bh, %ebp
-; X86-NEXT:    movzbl %bl, %ecx
+; X86-NEXT:    movb %al, 4(%edi)
+; X86-NEXT:    movzbl %dh, %ebp
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index ccabb360a990c9..e58524dc3334a0 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -93,7 +93,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %ecx
@@ -111,7 +111,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -128,23 +128,23 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %edi, %ecx
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %esi, %edi
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ebp
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    addl %edi, %esi
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %ebp
-; X86-NEXT:    addl (%esp), %esi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    adcl %ebx, %ebp
-; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -163,21 +163,21 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %ecx
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebx
 ; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edi
 ; X86-NEXT:    addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    mull %ecx
@@ -190,8 +190,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    mull %edx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
 ; X86-NEXT:    movl %eax, %ebp
@@ -206,7 +205,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl %edi, %ebp
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
@@ -219,6 +218,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ebp, (%esp) # 4-byte Spill
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
@@ -235,7 +235,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %ebp, %eax
 ; X86-NEXT:    mull %ebx
 ; X86-NEXT:    addl %esi, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movzbl %cl, %eax
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -269,7 +269,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    adcl %eax, %ebx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    adcl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -283,8 +283,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    mull %ecx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -297,22 +296,24 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %ecx, %edi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %esi
-; X86-NEXT:    addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    addl %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    adcl %ebx, %ebp
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    imull %edx, %ecx
+; X86-NEXT:    imull %edx, %eax
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %edx
-; X86-NEXT:    addl %edx, %ecx
+; X86-NEXT:    addl %edx, (%esp) # 4-byte Folded Spill
 ; X86-NEXT:    imull {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    addl %ecx, %ebx
+; X86-NEXT:    addl (%esp), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    addl %edi, %edx
 ; X86-NEXT:    adcl %esi, %ebx
@@ -320,9 +321,8 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %edx
@@ -334,7 +334,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ecx
@@ -376,15 +376,15 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %ecx, %ebp
-; X86-NEXT:    setb (%esp) # 1-byte Folded Spill
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebp, %ecx
-; X86-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %ebx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -392,7 +392,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %edx, %edi
@@ -403,7 +403,7 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    addl %ebp, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl %edi, %esi
 ; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -413,8 +413,8 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    addl %esi, %ebp
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    adcl %eax, %edi
-; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    adcl %ebx, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    addl %ecx, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    adcl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    adcl $0, %ebp
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
@@ -437,12 +437,12 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
@@ -496,9 +496,9 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
 ; X86-NEXT:    movl %edx, 16(%ecx)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl %edx, 20(%ecx)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, 24(%ecx)
 ; X86-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, 24(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl %edx, 28(%ecx)
 ; X86-NEXT:    movl %eax, 32(%ecx)
 ; X86-NEXT:    andl $4095, %ebx # imm = 0xFFF
diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index eacc714b49a4d4..ae345d838de5b5 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -307,22 +307,22 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    adcl %edx, %ecx
 ; X86-NEXT:    adcl $0, %esi
@@ -357,7 +357,7 @@ define i64 @func9(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    movl %edi, %eax
@@ -365,11 +365,11 @@ define i64 @func9(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    addl %ebp, %ebx
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    adcl %edx, %ecx
diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll
index 8c7078c7263284..3381228fbca34f 100644
--- a/llvm/test/CodeGen/X86/umul_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll
@@ -274,35 +274,40 @@ define i64 @func5(i64 %x, i64 %y) {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    .cfi_def_cfa_offset 24
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    andb %dl, %cl
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    seto %bl
+; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    seto %ch
-; X86-NEXT:    orb %bl, %ch
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
 ; X86-NEXT:    orb %cl, %ch
 ; X86-NEXT:    leal (%edi,%eax), %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb %ch, %cl
 ; X86-NEXT:    movl $-1, %ecx
 ; X86-NEXT:    cmovnel %ecx, %eax
 ; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
@@ -442,22 +447,22 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %edx, %esi
 ; X86-NEXT:    adcl $0, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %esi, %eax
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    adcl $0, %edi
@@ -496,22 +501,22 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    adcl $0, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    adcl %esi, %edx
 ; X86-NEXT:    adcl $0, %ecx
diff --git a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
index 132683cdb0f9e7..98f14108892f42 100644
--- a/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-64-legalisation-lowering.ll
@@ -12,32 +12,37 @@ define { i64, i8 } @mulodi_test(i64 %l, i64 %r) unnamed_addr #0 {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    .cfi_def_cfa_offset 24
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    setne %dl
 ; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    setne %cl
 ; X86-NEXT:    andb %dl, %cl
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    seto %bl
+; X86-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    mull %ebp
+; X86-NEXT:    mull %ebx
 ; X86-NEXT:    seto %ch
-; X86-NEXT:    orb %bl, %ch
+; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
 ; X86-NEXT:    orb %cl, %ch
 ; X86-NEXT:    leal (%edi,%eax), %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    mull %ebp
 ; X86-NEXT:    addl %esi, %edx
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    orb %ch, %cl
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index b1194bedc4e1ca..704f81ad96687d 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -634,105 +634,102 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movl %edx, %r11d
+; CHECK-BASELINE-NEXT:    movl %ecx, %r10d
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    xorb %r10b, %sil
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    xorb %dil, %sil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    xorb %r10b, %sil
-; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    xorb %dil, %sil
+; CHECK-BASELINE-NEXT:    xorb %cl, %dl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    xorb %cl, %dl
+; CHECK-BASELINE-NEXT:    xorb %r11b, %r10b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-BASELINE-NEXT:    xorb %r11b, %r10b
 ; CHECK-BASELINE-NEXT:    xorb %bl, %r8b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-BASELINE-NEXT:    xorb %bl, %r8b
-; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    xorb %r14b, %r9b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
 ; CHECK-BASELINE-NEXT:    xorb %r14b, %r9b
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-BASELINE-NEXT:    xorb %r12b, %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    xorb %r12b, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-BASELINE-NEXT:    xorb %bpl, %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    xorb %bpl, %cl
+; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
-; CHECK-BASELINE-NEXT:    xorb %r12b, %r14b
+; CHECK-BASELINE-NEXT:    xorb %r15b, %r14b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-BASELINE-NEXT:    xorb %r12b, %r14b
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
-; CHECK-BASELINE-NEXT:    xorb %bpl, %r12b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
-; CHECK-BASELINE-NEXT:    xorb %bpl, %r12b
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
-; CHECK-BASELINE-NEXT:    xorb %r15b, %sil
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    xorb %r15b, %sil
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
-; CHECK-BASELINE-NEXT:    xorb %r13b, %dl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-BASELINE-NEXT:    xorb %r13b, %dl
+; CHECK-BASELINE-NEXT:    xorb %r15b, %r14b
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    xorb %r13b, %dil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dil
+; CHECK-BASELINE-NEXT:    xorb %r13b, %dil
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
+; CHECK-BASELINE-NEXT:    xorb %cl, %r11b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-BASELINE-NEXT:    xorb %cl, %r11b
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    xorb %al, %r13b
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-BASELINE-NEXT:    xorb %cl, %r13b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
-; CHECK-BASELINE-NEXT:    xorb %al, %r13b
+; CHECK-BASELINE-NEXT:    xorb %cl, %r13b
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    xorb %al, %r15b
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-BASELINE-NEXT:    xorb %cl, %r15b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
-; CHECK-BASELINE-NEXT:    xorb %al, %r15b
+; CHECK-BASELINE-NEXT:    xorb %cl, %r15b
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    xorb %al, %bpl
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-BASELINE-NEXT:    xorb %cl, %bpl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-BASELINE-NEXT:    xorb %al, %bpl
+; CHECK-BASELINE-NEXT:    xorb %cl, %bpl
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    xorb %al, %bl
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-BASELINE-NEXT:    xorb %cl, %bl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-BASELINE-NEXT:    xorb %al, %bl
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
-; CHECK-BASELINE-NEXT:    xorb %r8b, %al
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-BASELINE-NEXT:    xorb %r8b, %al
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
-; CHECK-BASELINE-NEXT:    xorb %r8b, %r10b
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-BASELINE-NEXT:    xorb %r8b, %r10b
-; CHECK-BASELINE-NEXT:    movb %r10b, 15(%rdi)
-; CHECK-BASELINE-NEXT:    movb %al, 14(%rdi)
-; CHECK-BASELINE-NEXT:    movb %bl, 13(%rdi)
-; CHECK-BASELINE-NEXT:    movb %bpl, 12(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r15b, 11(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r13b, 10(%rdi)
-; CHECK-BASELINE-NEXT:    movb %cl, 9(%rdi)
-; CHECK-BASELINE-NEXT:    movb %dl, 8(%rdi)
-; CHECK-BASELINE-NEXT:    movb %sil, 7(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r12b, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r14b, 5(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movb %al, 3(%rdi)
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movb %al, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r11b, 1(%rdi)
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movb %al, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    xorb %cl, %bl
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
+; CHECK-BASELINE-NEXT:    xorb %r12b, %cl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    xorb %r12b, %cl
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r12b
+; CHECK-BASELINE-NEXT:    movb %r12b, 15(%rax)
+; CHECK-BASELINE-NEXT:    movb %cl, 14(%rax)
+; CHECK-BASELINE-NEXT:    movb %bl, 13(%rax)
+; CHECK-BASELINE-NEXT:    movb %bpl, 12(%rax)
+; CHECK-BASELINE-NEXT:    movb %r15b, 11(%rax)
+; CHECK-BASELINE-NEXT:    movb %r13b, 10(%rax)
+; CHECK-BASELINE-NEXT:    movb %r11b, 9(%rax)
+; CHECK-BASELINE-NEXT:    movb %dil, 8(%rax)
+; CHECK-BASELINE-NEXT:    movb %r14b, 7(%rax)
+; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movb %cl, 6(%rax)
+; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movb %cl, 5(%rax)
+; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rax)
+; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
+; CHECK-BASELINE-NEXT:    movb %r10b, 2(%rax)
+; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
+; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -749,105 +746,102 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movl %edx, %r11d
+; CHECK-SSE1-NEXT:    movl %ecx, %r10d
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    xorb %r10b, %sil
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    xorb %dil, %sil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    xorb %r10b, %sil
-; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    xorb %dl, %r11b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
-; CHECK-SSE1-NEXT:    xorb %dl, %r11b
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    xorb %dil, %sil
+; CHECK-SSE1-NEXT:    xorb %cl, %dl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    xorb %cl, %dl
+; CHECK-SSE1-NEXT:    xorb %r11b, %r10b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
+; CHECK-SSE1-NEXT:    xorb %r11b, %r10b
 ; CHECK-SSE1-NEXT:    xorb %bl, %r8b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-SSE1-NEXT:    xorb %bl, %r8b
-; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    xorb %r14b, %r9b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
 ; CHECK-SSE1-NEXT:    xorb %r14b, %r9b
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-SSE1-NEXT:    xorb %r12b, %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    xorb %r12b, %cl
+; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-SSE1-NEXT:    xorb %bpl, %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    xorb %bpl, %cl
+; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
-; CHECK-SSE1-NEXT:    xorb %r12b, %r14b
+; CHECK-SSE1-NEXT:    xorb %r15b, %r14b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r14b
-; CHECK-SSE1-NEXT:    xorb %r12b, %r14b
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
-; CHECK-SSE1-NEXT:    xorb %bpl, %r12b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
-; CHECK-SSE1-NEXT:    xorb %bpl, %r12b
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
-; CHECK-SSE1-NEXT:    xorb %r15b, %sil
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    xorb %r15b, %sil
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
-; CHECK-SSE1-NEXT:    xorb %r13b, %dl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
-; CHECK-SSE1-NEXT:    xorb %r13b, %dl
+; CHECK-SSE1-NEXT:    xorb %r15b, %r14b
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    xorb %r13b, %dil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dil
+; CHECK-SSE1-NEXT:    xorb %r13b, %dil
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
+; CHECK-SSE1-NEXT:    xorb %cl, %r11b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r11b
+; CHECK-SSE1-NEXT:    xorb %cl, %r11b
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    xorb %al, %r13b
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-SSE1-NEXT:    xorb %cl, %r13b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
-; CHECK-SSE1-NEXT:    xorb %al, %r13b
+; CHECK-SSE1-NEXT:    xorb %cl, %r13b
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    xorb %al, %r15b
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-SSE1-NEXT:    xorb %cl, %r15b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
-; CHECK-SSE1-NEXT:    xorb %al, %r15b
+; CHECK-SSE1-NEXT:    xorb %cl, %r15b
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    xorb %al, %bpl
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-SSE1-NEXT:    xorb %cl, %bpl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
-; CHECK-SSE1-NEXT:    xorb %al, %bpl
+; CHECK-SSE1-NEXT:    xorb %cl, %bpl
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    xorb %al, %bl
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-SSE1-NEXT:    xorb %cl, %bl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
-; CHECK-SSE1-NEXT:    xorb %al, %bl
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
-; CHECK-SSE1-NEXT:    xorb %r8b, %al
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
-; CHECK-SSE1-NEXT:    xorb %r8b, %al
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r8d
-; CHECK-SSE1-NEXT:    xorb %r8b, %r10b
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
-; CHECK-SSE1-NEXT:    xorb %r8b, %r10b
-; CHECK-SSE1-NEXT:    movb %r10b, 15(%rdi)
-; CHECK-SSE1-NEXT:    movb %al, 14(%rdi)
-; CHECK-SSE1-NEXT:    movb %bl, 13(%rdi)
-; CHECK-SSE1-NEXT:    movb %bpl, 12(%rdi)
-; CHECK-SSE1-NEXT:    movb %r15b, 11(%rdi)
-; CHECK-SSE1-NEXT:    movb %r13b, 10(%rdi)
-; CHECK-SSE1-NEXT:    movb %cl, 9(%rdi)
-; CHECK-SSE1-NEXT:    movb %dl, 8(%rdi)
-; CHECK-SSE1-NEXT:    movb %sil, 7(%rdi)
-; CHECK-SSE1-NEXT:    movb %r12b, 6(%rdi)
-; CHECK-SSE1-NEXT:    movb %r14b, 5(%rdi)
-; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movb %al, 3(%rdi)
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movb %al, 2(%rdi)
-; CHECK-SSE1-NEXT:    movb %r11b, 1(%rdi)
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movb %al, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    xorb %cl, %bl
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
+; CHECK-SSE1-NEXT:    xorb %r12b, %cl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    xorb %r12b, %cl
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r12b
+; CHECK-SSE1-NEXT:    movb %r12b, 15(%rax)
+; CHECK-SSE1-NEXT:    movb %cl, 14(%rax)
+; CHECK-SSE1-NEXT:    movb %bl, 13(%rax)
+; CHECK-SSE1-NEXT:    movb %bpl, 12(%rax)
+; CHECK-SSE1-NEXT:    movb %r15b, 11(%rax)
+; CHECK-SSE1-NEXT:    movb %r13b, 10(%rax)
+; CHECK-SSE1-NEXT:    movb %r11b, 9(%rax)
+; CHECK-SSE1-NEXT:    movb %dil, 8(%rax)
+; CHECK-SSE1-NEXT:    movb %r14b, 7(%rax)
+; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    movb %cl, 6(%rax)
+; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    movb %cl, 5(%rax)
+; CHECK-SSE1-NEXT:    movb %r9b, 4(%rax)
+; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
+; CHECK-SSE1-NEXT:    movb %r10b, 2(%rax)
+; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
+; CHECK-SSE1-NEXT:    movb %sil, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -1194,10 +1188,7 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movq %rcx, %r10
-; CHECK-BASELINE-NEXT:    movq %rdx, %r8
-; CHECK-BASELINE-NEXT:    movq %rsi, %r9
-; CHECK-BASELINE-NEXT:    movq %rdi, %r11
+; CHECK-BASELINE-NEXT:    movq %rdi, %r8
 ; CHECK-BASELINE-NEXT:    movzbl 15(%rdx), %eax
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-BASELINE-NEXT:    movzbl 14(%rdx), %eax
@@ -1215,236 +1206,236 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    movzbl 7(%rdx), %r15d
 ; CHECK-BASELINE-NEXT:    movzbl 6(%rdx), %r12d
 ; CHECK-BASELINE-NEXT:    movzbl 5(%rdx), %r13d
-; CHECK-BASELINE-NEXT:    movzbl 4(%rdx), %esi
-; CHECK-BASELINE-NEXT:    movzbl 3(%rdx), %edx
-; CHECK-BASELINE-NEXT:    movzbl 2(%r8), %edi
-; CHECK-BASELINE-NEXT:    movzbl (%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 1(%r8), %ecx
-; CHECK-BASELINE-NEXT:    movzbl (%r9), %ebx
-; CHECK-BASELINE-NEXT:    xorb %al, %bl
-; CHECK-BASELINE-NEXT:    andb (%r10), %bl
-; CHECK-BASELINE-NEXT:    xorb %al, %bl
+; CHECK-BASELINE-NEXT:    movzbl 4(%rdx), %r11d
+; CHECK-BASELINE-NEXT:    movzbl 3(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movzbl 2(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    movzbl (%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 1(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movzbl (%rsi), %ebx
+; CHECK-BASELINE-NEXT:    xorb %dil, %bl
+; CHECK-BASELINE-NEXT:    andb (%rcx), %bl
+; CHECK-BASELINE-NEXT:    xorb %dil, %bl
 ; CHECK-BASELINE-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 1(%r9), %eax
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
-; CHECK-BASELINE-NEXT:    andb 1(%r10), %al
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
-; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 2(%r9), %eax
-; CHECK-BASELINE-NEXT:    xorb %dil, %al
-; CHECK-BASELINE-NEXT:    andb 2(%r10), %al
-; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movzbl 1(%rsi), %edi
+; CHECK-BASELINE-NEXT:    xorb %al, %dil
+; CHECK-BASELINE-NEXT:    andb 1(%rcx), %dil
+; CHECK-BASELINE-NEXT:    xorb %al, %dil
+; CHECK-BASELINE-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 2(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %r9b, %al
+; CHECK-BASELINE-NEXT:    andb 2(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %r9b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 3(%r9), %eax
-; CHECK-BASELINE-NEXT:    xorb %dl, %al
-; CHECK-BASELINE-NEXT:    andb 3(%r10), %al
-; CHECK-BASELINE-NEXT:    xorb %dl, %al
+; CHECK-BASELINE-NEXT:    movzbl 3(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %r10b, %al
+; CHECK-BASELINE-NEXT:    andb 3(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %r10b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 4(%r9), %eax
-; CHECK-BASELINE-NEXT:    xorb %sil, %al
-; CHECK-BASELINE-NEXT:    andb 4(%r10), %al
-; CHECK-BASELINE-NEXT:    xorb %sil, %al
+; CHECK-BASELINE-NEXT:    movzbl 4(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %r11b, %al
+; CHECK-BASELINE-NEXT:    andb 4(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %r11b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 5(%r9), %eax
+; CHECK-BASELINE-NEXT:    movzbl 5(%rsi), %eax
 ; CHECK-BASELINE-NEXT:    xorb %r13b, %al
-; CHECK-BASELINE-NEXT:    andb 5(%r10), %al
+; CHECK-BASELINE-NEXT:    andb 5(%rcx), %al
 ; CHECK-BASELINE-NEXT:    xorb %r13b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 6(%r9), %eax
+; CHECK-BASELINE-NEXT:    movzbl 6(%rsi), %eax
 ; CHECK-BASELINE-NEXT:    xorb %r12b, %al
-; CHECK-BASELINE-NEXT:    andb 6(%r10), %al
+; CHECK-BASELINE-NEXT:    andb 6(%rcx), %al
 ; CHECK-BASELINE-NEXT:    xorb %r12b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 7(%r9), %eax
+; CHECK-BASELINE-NEXT:    movzbl 7(%rsi), %eax
 ; CHECK-BASELINE-NEXT:    xorb %r15b, %al
-; CHECK-BASELINE-NEXT:    andb 7(%r10), %al
+; CHECK-BASELINE-NEXT:    andb 7(%rcx), %al
 ; CHECK-BASELINE-NEXT:    xorb %r15b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 8(%r9), %eax
+; CHECK-BASELINE-NEXT:    movzbl 8(%rsi), %eax
 ; CHECK-BASELINE-NEXT:    xorb %r14b, %al
-; CHECK-BASELINE-NEXT:    andb 8(%r10), %al
+; CHECK-BASELINE-NEXT:    andb 8(%rcx), %al
 ; CHECK-BASELINE-NEXT:    xorb %r14b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 9(%r9), %eax
+; CHECK-BASELINE-NEXT:    movzbl 9(%rsi), %eax
 ; CHECK-BASELINE-NEXT:    xorb %bpl, %al
-; CHECK-BASELINE-NEXT:    andb 9(%r10), %al
+; CHECK-BASELINE-NEXT:    andb 9(%rcx), %al
 ; CHECK-BASELINE-NEXT:    xorb %bpl, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 10(%r9), %eax
-; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
-; CHECK-BASELINE-NEXT:    andb 10(%r10), %al
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movzbl 10(%rsi), %eax
+; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 10(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 11(%r9), %eax
-; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
-; CHECK-BASELINE-NEXT:    andb 11(%r10), %al
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movzbl 11(%rsi), %eax
+; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 11(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 12(%r9), %eax
-; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
-; CHECK-BASELINE-NEXT:    andb 12(%r10), %al
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movzbl 12(%rsi), %eax
+; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 12(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 13(%r9), %eax
-; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
-; CHECK-BASELINE-NEXT:    andb 13(%r10), %al
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movzbl 13(%rsi), %eax
+; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 13(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 14(%r9), %eax
-; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
-; CHECK-BASELINE-NEXT:    andb 14(%r10), %al
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movzbl 14(%rsi), %eax
+; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 14(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 15(%r9), %eax
-; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
-; CHECK-BASELINE-NEXT:    andb 15(%r10), %al
-; CHECK-BASELINE-NEXT:    xorb %cl, %al
+; CHECK-BASELINE-NEXT:    movzbl 15(%rsi), %eax
+; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 15(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 16(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 16(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 16(%r10), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 17(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 17(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 17(%r10), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 18(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 18(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 18(%r10), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 19(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 19(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 19(%r10), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 20(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 20(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 20(%r10), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 21(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 21(%r9), %r13d
-; CHECK-BASELINE-NEXT:    xorb %al, %r13b
-; CHECK-BASELINE-NEXT:    andb 21(%r10), %r13b
-; CHECK-BASELINE-NEXT:    xorb %al, %r13b
-; CHECK-BASELINE-NEXT:    movzbl 22(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 22(%r9), %r12d
-; CHECK-BASELINE-NEXT:    xorb %al, %r12b
-; CHECK-BASELINE-NEXT:    andb 22(%r10), %r12b
-; CHECK-BASELINE-NEXT:    xorb %al, %r12b
-; CHECK-BASELINE-NEXT:    movzbl 23(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 23(%r9), %r15d
-; CHECK-BASELINE-NEXT:    xorb %al, %r15b
-; CHECK-BASELINE-NEXT:    andb 23(%r10), %r15b
-; CHECK-BASELINE-NEXT:    xorb %al, %r15b
-; CHECK-BASELINE-NEXT:    movzbl 24(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 24(%r9), %r14d
-; CHECK-BASELINE-NEXT:    xorb %al, %r14b
-; CHECK-BASELINE-NEXT:    andb 24(%r10), %r14b
-; CHECK-BASELINE-NEXT:    xorb %al, %r14b
-; CHECK-BASELINE-NEXT:    movzbl 25(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 25(%r9), %ebp
-; CHECK-BASELINE-NEXT:    xorb %al, %bpl
-; CHECK-BASELINE-NEXT:    andb 25(%r10), %bpl
-; CHECK-BASELINE-NEXT:    xorb %al, %bpl
-; CHECK-BASELINE-NEXT:    movzbl 26(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 26(%r9), %edi
+; CHECK-BASELINE-NEXT:    movzbl 16(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 16(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 16(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 17(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 17(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 17(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 18(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 18(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 18(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 19(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 19(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 19(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 20(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 20(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 20(%rcx), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 21(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 21(%rsi), %r13d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r13b
+; CHECK-BASELINE-NEXT:    andb 21(%rcx), %r13b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r13b
+; CHECK-BASELINE-NEXT:    movzbl 22(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 22(%rsi), %r12d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r12b
+; CHECK-BASELINE-NEXT:    andb 22(%rcx), %r12b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r12b
+; CHECK-BASELINE-NEXT:    movzbl 23(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 23(%rsi), %r15d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r15b
+; CHECK-BASELINE-NEXT:    andb 23(%rcx), %r15b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r15b
+; CHECK-BASELINE-NEXT:    movzbl 24(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 24(%rsi), %r14d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r14b
+; CHECK-BASELINE-NEXT:    andb 24(%rcx), %r14b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r14b
+; CHECK-BASELINE-NEXT:    movzbl 25(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 25(%rsi), %ebp
+; CHECK-BASELINE-NEXT:    xorb %dil, %bpl
+; CHECK-BASELINE-NEXT:    andb 25(%rcx), %bpl
+; CHECK-BASELINE-NEXT:    xorb %dil, %bpl
+; CHECK-BASELINE-NEXT:    movzbl 26(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 26(%rsi), %r11d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r11b
+; CHECK-BASELINE-NEXT:    andb 26(%rcx), %r11b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r11b
+; CHECK-BASELINE-NEXT:    movzbl 27(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 27(%rsi), %r10d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r10b
+; CHECK-BASELINE-NEXT:    andb 27(%rcx), %r10b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r10b
+; CHECK-BASELINE-NEXT:    movzbl 28(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 28(%rsi), %r9d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r9b
+; CHECK-BASELINE-NEXT:    andb 28(%rcx), %r9b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r9b
+; CHECK-BASELINE-NEXT:    movzbl 29(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movzbl 29(%rsi), %edi
 ; CHECK-BASELINE-NEXT:    xorb %al, %dil
-; CHECK-BASELINE-NEXT:    andb 26(%r10), %dil
+; CHECK-BASELINE-NEXT:    andb 29(%rcx), %dil
 ; CHECK-BASELINE-NEXT:    xorb %al, %dil
-; CHECK-BASELINE-NEXT:    movzbl 27(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 27(%r9), %esi
-; CHECK-BASELINE-NEXT:    xorb %al, %sil
-; CHECK-BASELINE-NEXT:    andb 27(%r10), %sil
-; CHECK-BASELINE-NEXT:    xorb %al, %sil
-; CHECK-BASELINE-NEXT:    movzbl 28(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 28(%r9), %edx
-; CHECK-BASELINE-NEXT:    xorb %al, %dl
-; CHECK-BASELINE-NEXT:    andb 28(%r10), %dl
-; CHECK-BASELINE-NEXT:    xorb %al, %dl
-; CHECK-BASELINE-NEXT:    movzbl 29(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzbl 29(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 29(%r10), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movzbl 30(%r8), %ebx
-; CHECK-BASELINE-NEXT:    movzbl 30(%r9), %eax
+; CHECK-BASELINE-NEXT:    movzbl 30(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movzbl 30(%rsi), %eax
 ; CHECK-BASELINE-NEXT:    xorb %bl, %al
-; CHECK-BASELINE-NEXT:    andb 30(%r10), %al
+; CHECK-BASELINE-NEXT:    andb 30(%rcx), %al
 ; CHECK-BASELINE-NEXT:    xorb %bl, %al
-; CHECK-BASELINE-NEXT:    movzbl 31(%r8), %r8d
-; CHECK-BASELINE-NEXT:    movzbl 31(%r9), %r9d
-; CHECK-BASELINE-NEXT:    xorb %r8b, %r9b
-; CHECK-BASELINE-NEXT:    andb 31(%r10), %r9b
-; CHECK-BASELINE-NEXT:    xorb %r8b, %r9b
-; CHECK-BASELINE-NEXT:    movb %r9b, 31(%r11)
-; CHECK-BASELINE-NEXT:    movb %al, 30(%r11)
-; CHECK-BASELINE-NEXT:    movb %cl, 29(%r11)
-; CHECK-BASELINE-NEXT:    movb %dl, 28(%r11)
-; CHECK-BASELINE-NEXT:    movb %sil, 27(%r11)
-; CHECK-BASELINE-NEXT:    movb %dil, 26(%r11)
-; CHECK-BASELINE-NEXT:    movb %bpl, 25(%r11)
-; CHECK-BASELINE-NEXT:    movb %r14b, 24(%r11)
-; CHECK-BASELINE-NEXT:    movb %r15b, 23(%r11)
-; CHECK-BASELINE-NEXT:    movb %r12b, 22(%r11)
-; CHECK-BASELINE-NEXT:    movb %r13b, 21(%r11)
+; CHECK-BASELINE-NEXT:    movzbl 31(%rdx), %edx
+; CHECK-BASELINE-NEXT:    movzbl 31(%rsi), %esi
+; CHECK-BASELINE-NEXT:    xorb %dl, %sil
+; CHECK-BASELINE-NEXT:    andb 31(%rcx), %sil
+; CHECK-BASELINE-NEXT:    xorb %dl, %sil
+; CHECK-BASELINE-NEXT:    movb %sil, 31(%r8)
+; CHECK-BASELINE-NEXT:    movb %al, 30(%r8)
+; CHECK-BASELINE-NEXT:    movb %dil, 29(%r8)
+; CHECK-BASELINE-NEXT:    movb %r9b, 28(%r8)
+; CHECK-BASELINE-NEXT:    movb %r10b, 27(%r8)
+; CHECK-BASELINE-NEXT:    movb %r11b, 26(%r8)
+; CHECK-BASELINE-NEXT:    movb %bpl, 25(%r8)
+; CHECK-BASELINE-NEXT:    movb %r14b, 24(%r8)
+; CHECK-BASELINE-NEXT:    movb %r15b, 23(%r8)
+; CHECK-BASELINE-NEXT:    movb %r12b, 22(%r8)
+; CHECK-BASELINE-NEXT:    movb %r13b, 21(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 20(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 20(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 19(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 19(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 18(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 18(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 17(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 17(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 16(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 16(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 15(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 15(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 14(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 14(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 13(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 13(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 12(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 12(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 11(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 11(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 10(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 10(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 9(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 9(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 8(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 8(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 7(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 7(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 6(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 6(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 5(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 5(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 4(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 4(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 3(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 3(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 2(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 2(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 1(%r11)
+; CHECK-BASELINE-NEXT:    movb %al, 1(%r8)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, (%r11)
-; CHECK-BASELINE-NEXT:    movq %r11, %rax
+; CHECK-BASELINE-NEXT:    movb %al, (%r8)
+; CHECK-BASELINE-NEXT:    movq %r8, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -1461,10 +1452,7 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movq %rcx, %r10
-; CHECK-SSE1-NEXT:    movq %rdx, %r8
-; CHECK-SSE1-NEXT:    movq %rsi, %r9
-; CHECK-SSE1-NEXT:    movq %rdi, %r11
+; CHECK-SSE1-NEXT:    movq %rdi, %r8
 ; CHECK-SSE1-NEXT:    movzbl 15(%rdx), %eax
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT:    movzbl 14(%rdx), %eax
@@ -1482,236 +1470,236 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    movzbl 7(%rdx), %r15d
 ; CHECK-SSE1-NEXT:    movzbl 6(%rdx), %r12d
 ; CHECK-SSE1-NEXT:    movzbl 5(%rdx), %r13d
-; CHECK-SSE1-NEXT:    movzbl 4(%rdx), %esi
-; CHECK-SSE1-NEXT:    movzbl 3(%rdx), %edx
-; CHECK-SSE1-NEXT:    movzbl 2(%r8), %edi
-; CHECK-SSE1-NEXT:    movzbl (%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 1(%r8), %ecx
-; CHECK-SSE1-NEXT:    movzbl (%r9), %ebx
-; CHECK-SSE1-NEXT:    xorb %al, %bl
-; CHECK-SSE1-NEXT:    andb (%r10), %bl
-; CHECK-SSE1-NEXT:    xorb %al, %bl
+; CHECK-SSE1-NEXT:    movzbl 4(%rdx), %r11d
+; CHECK-SSE1-NEXT:    movzbl 3(%rdx), %r10d
+; CHECK-SSE1-NEXT:    movzbl 2(%rdx), %r9d
+; CHECK-SSE1-NEXT:    movzbl (%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 1(%rdx), %eax
+; CHECK-SSE1-NEXT:    movzbl (%rsi), %ebx
+; CHECK-SSE1-NEXT:    xorb %dil, %bl
+; CHECK-SSE1-NEXT:    andb (%rcx), %bl
+; CHECK-SSE1-NEXT:    xorb %dil, %bl
 ; CHECK-SSE1-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 1(%r9), %eax
-; CHECK-SSE1-NEXT:    xorb %cl, %al
-; CHECK-SSE1-NEXT:    andb 1(%r10), %al
-; CHECK-SSE1-NEXT:    xorb %cl, %al
-; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 2(%r9), %eax
-; CHECK-SSE1-NEXT:    xorb %dil, %al
-; CHECK-SSE1-NEXT:    andb 2(%r10), %al
-; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movzbl 1(%rsi), %edi
+; CHECK-SSE1-NEXT:    xorb %al, %dil
+; CHECK-SSE1-NEXT:    andb 1(%rcx), %dil
+; CHECK-SSE1-NEXT:    xorb %al, %dil
+; CHECK-SSE1-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 2(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %r9b, %al
+; CHECK-SSE1-NEXT:    andb 2(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %r9b, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 3(%r9), %eax
-; CHECK-SSE1-NEXT:    xorb %dl, %al
-; CHECK-SSE1-NEXT:    andb 3(%r10), %al
-; CHECK-SSE1-NEXT:    xorb %dl, %al
+; CHECK-SSE1-NEXT:    movzbl 3(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %r10b, %al
+; CHECK-SSE1-NEXT:    andb 3(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %r10b, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 4(%r9), %eax
-; CHECK-SSE1-NEXT:    xorb %sil, %al
-; CHECK-SSE1-NEXT:    andb 4(%r10), %al
-; CHECK-SSE1-NEXT:    xorb %sil, %al
+; CHECK-SSE1-NEXT:    movzbl 4(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %r11b, %al
+; CHECK-SSE1-NEXT:    andb 4(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %r11b, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 5(%r9), %eax
+; CHECK-SSE1-NEXT:    movzbl 5(%rsi), %eax
 ; CHECK-SSE1-NEXT:    xorb %r13b, %al
-; CHECK-SSE1-NEXT:    andb 5(%r10), %al
+; CHECK-SSE1-NEXT:    andb 5(%rcx), %al
 ; CHECK-SSE1-NEXT:    xorb %r13b, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 6(%r9), %eax
+; CHECK-SSE1-NEXT:    movzbl 6(%rsi), %eax
 ; CHECK-SSE1-NEXT:    xorb %r12b, %al
-; CHECK-SSE1-NEXT:    andb 6(%r10), %al
+; CHECK-SSE1-NEXT:    andb 6(%rcx), %al
 ; CHECK-SSE1-NEXT:    xorb %r12b, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 7(%r9), %eax
+; CHECK-SSE1-NEXT:    movzbl 7(%rsi), %eax
 ; CHECK-SSE1-NEXT:    xorb %r15b, %al
-; CHECK-SSE1-NEXT:    andb 7(%r10), %al
+; CHECK-SSE1-NEXT:    andb 7(%rcx), %al
 ; CHECK-SSE1-NEXT:    xorb %r15b, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 8(%r9), %eax
+; CHECK-SSE1-NEXT:    movzbl 8(%rsi), %eax
 ; CHECK-SSE1-NEXT:    xorb %r14b, %al
-; CHECK-SSE1-NEXT:    andb 8(%r10), %al
+; CHECK-SSE1-NEXT:    andb 8(%rcx), %al
 ; CHECK-SSE1-NEXT:    xorb %r14b, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 9(%r9), %eax
+; CHECK-SSE1-NEXT:    movzbl 9(%rsi), %eax
 ; CHECK-SSE1-NEXT:    xorb %bpl, %al
-; CHECK-SSE1-NEXT:    andb 9(%r10), %al
+; CHECK-SSE1-NEXT:    andb 9(%rcx), %al
 ; CHECK-SSE1-NEXT:    xorb %bpl, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 10(%r9), %eax
-; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorb %cl, %al
-; CHECK-SSE1-NEXT:    andb 10(%r10), %al
-; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movzbl 10(%rsi), %eax
+; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 10(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 11(%r9), %eax
-; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorb %cl, %al
-; CHECK-SSE1-NEXT:    andb 11(%r10), %al
-; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movzbl 11(%rsi), %eax
+; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 11(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 12(%r9), %eax
-; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorb %cl, %al
-; CHECK-SSE1-NEXT:    andb 12(%r10), %al
-; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movzbl 12(%rsi), %eax
+; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 12(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 13(%r9), %eax
-; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorb %cl, %al
-; CHECK-SSE1-NEXT:    andb 13(%r10), %al
-; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movzbl 13(%rsi), %eax
+; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 13(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 14(%r9), %eax
-; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorb %cl, %al
-; CHECK-SSE1-NEXT:    andb 14(%r10), %al
-; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movzbl 14(%rsi), %eax
+; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 14(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 15(%r9), %eax
-; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorb %cl, %al
-; CHECK-SSE1-NEXT:    andb 15(%r10), %al
-; CHECK-SSE1-NEXT:    xorb %cl, %al
+; CHECK-SSE1-NEXT:    movzbl 15(%rsi), %eax
+; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 15(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 16(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 16(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 16(%r10), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 17(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 17(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 17(%r10), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 18(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 18(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 18(%r10), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 19(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 19(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 19(%r10), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 20(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 20(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 20(%r10), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 21(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 21(%r9), %r13d
-; CHECK-SSE1-NEXT:    xorb %al, %r13b
-; CHECK-SSE1-NEXT:    andb 21(%r10), %r13b
-; CHECK-SSE1-NEXT:    xorb %al, %r13b
-; CHECK-SSE1-NEXT:    movzbl 22(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 22(%r9), %r12d
-; CHECK-SSE1-NEXT:    xorb %al, %r12b
-; CHECK-SSE1-NEXT:    andb 22(%r10), %r12b
-; CHECK-SSE1-NEXT:    xorb %al, %r12b
-; CHECK-SSE1-NEXT:    movzbl 23(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 23(%r9), %r15d
-; CHECK-SSE1-NEXT:    xorb %al, %r15b
-; CHECK-SSE1-NEXT:    andb 23(%r10), %r15b
-; CHECK-SSE1-NEXT:    xorb %al, %r15b
-; CHECK-SSE1-NEXT:    movzbl 24(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 24(%r9), %r14d
-; CHECK-SSE1-NEXT:    xorb %al, %r14b
-; CHECK-SSE1-NEXT:    andb 24(%r10), %r14b
-; CHECK-SSE1-NEXT:    xorb %al, %r14b
-; CHECK-SSE1-NEXT:    movzbl 25(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 25(%r9), %ebp
-; CHECK-SSE1-NEXT:    xorb %al, %bpl
-; CHECK-SSE1-NEXT:    andb 25(%r10), %bpl
-; CHECK-SSE1-NEXT:    xorb %al, %bpl
-; CHECK-SSE1-NEXT:    movzbl 26(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 26(%r9), %edi
+; CHECK-SSE1-NEXT:    movzbl 16(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 16(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 16(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 17(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 17(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 17(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 18(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 18(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 18(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 19(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 19(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 19(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 20(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 20(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 20(%rcx), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 21(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 21(%rsi), %r13d
+; CHECK-SSE1-NEXT:    xorb %dil, %r13b
+; CHECK-SSE1-NEXT:    andb 21(%rcx), %r13b
+; CHECK-SSE1-NEXT:    xorb %dil, %r13b
+; CHECK-SSE1-NEXT:    movzbl 22(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 22(%rsi), %r12d
+; CHECK-SSE1-NEXT:    xorb %dil, %r12b
+; CHECK-SSE1-NEXT:    andb 22(%rcx), %r12b
+; CHECK-SSE1-NEXT:    xorb %dil, %r12b
+; CHECK-SSE1-NEXT:    movzbl 23(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 23(%rsi), %r15d
+; CHECK-SSE1-NEXT:    xorb %dil, %r15b
+; CHECK-SSE1-NEXT:    andb 23(%rcx), %r15b
+; CHECK-SSE1-NEXT:    xorb %dil, %r15b
+; CHECK-SSE1-NEXT:    movzbl 24(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 24(%rsi), %r14d
+; CHECK-SSE1-NEXT:    xorb %dil, %r14b
+; CHECK-SSE1-NEXT:    andb 24(%rcx), %r14b
+; CHECK-SSE1-NEXT:    xorb %dil, %r14b
+; CHECK-SSE1-NEXT:    movzbl 25(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 25(%rsi), %ebp
+; CHECK-SSE1-NEXT:    xorb %dil, %bpl
+; CHECK-SSE1-NEXT:    andb 25(%rcx), %bpl
+; CHECK-SSE1-NEXT:    xorb %dil, %bpl
+; CHECK-SSE1-NEXT:    movzbl 26(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 26(%rsi), %r11d
+; CHECK-SSE1-NEXT:    xorb %dil, %r11b
+; CHECK-SSE1-NEXT:    andb 26(%rcx), %r11b
+; CHECK-SSE1-NEXT:    xorb %dil, %r11b
+; CHECK-SSE1-NEXT:    movzbl 27(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 27(%rsi), %r10d
+; CHECK-SSE1-NEXT:    xorb %dil, %r10b
+; CHECK-SSE1-NEXT:    andb 27(%rcx), %r10b
+; CHECK-SSE1-NEXT:    xorb %dil, %r10b
+; CHECK-SSE1-NEXT:    movzbl 28(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 28(%rsi), %r9d
+; CHECK-SSE1-NEXT:    xorb %dil, %r9b
+; CHECK-SSE1-NEXT:    andb 28(%rcx), %r9b
+; CHECK-SSE1-NEXT:    xorb %dil, %r9b
+; CHECK-SSE1-NEXT:    movzbl 29(%rdx), %eax
+; CHECK-SSE1-NEXT:    movzbl 29(%rsi), %edi
 ; CHECK-SSE1-NEXT:    xorb %al, %dil
-; CHECK-SSE1-NEXT:    andb 26(%r10), %dil
+; CHECK-SSE1-NEXT:    andb 29(%rcx), %dil
 ; CHECK-SSE1-NEXT:    xorb %al, %dil
-; CHECK-SSE1-NEXT:    movzbl 27(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 27(%r9), %esi
-; CHECK-SSE1-NEXT:    xorb %al, %sil
-; CHECK-SSE1-NEXT:    andb 27(%r10), %sil
-; CHECK-SSE1-NEXT:    xorb %al, %sil
-; CHECK-SSE1-NEXT:    movzbl 28(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 28(%r9), %edx
-; CHECK-SSE1-NEXT:    xorb %al, %dl
-; CHECK-SSE1-NEXT:    andb 28(%r10), %dl
-; CHECK-SSE1-NEXT:    xorb %al, %dl
-; CHECK-SSE1-NEXT:    movzbl 29(%r8), %eax
-; CHECK-SSE1-NEXT:    movzbl 29(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 29(%r10), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movzbl 30(%r8), %ebx
-; CHECK-SSE1-NEXT:    movzbl 30(%r9), %eax
+; CHECK-SSE1-NEXT:    movzbl 30(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movzbl 30(%rsi), %eax
 ; CHECK-SSE1-NEXT:    xorb %bl, %al
-; CHECK-SSE1-NEXT:    andb 30(%r10), %al
+; CHECK-SSE1-NEXT:    andb 30(%rcx), %al
 ; CHECK-SSE1-NEXT:    xorb %bl, %al
-; CHECK-SSE1-NEXT:    movzbl 31(%r8), %r8d
-; CHECK-SSE1-NEXT:    movzbl 31(%r9), %r9d
-; CHECK-SSE1-NEXT:    xorb %r8b, %r9b
-; CHECK-SSE1-NEXT:    andb 31(%r10), %r9b
-; CHECK-SSE1-NEXT:    xorb %r8b, %r9b
-; CHECK-SSE1-NEXT:    movb %r9b, 31(%r11)
-; CHECK-SSE1-NEXT:    movb %al, 30(%r11)
-; CHECK-SSE1-NEXT:    movb %cl, 29(%r11)
-; CHECK-SSE1-NEXT:    movb %dl, 28(%r11)
-; CHECK-SSE1-NEXT:    movb %sil, 27(%r11)
-; CHECK-SSE1-NEXT:    movb %dil, 26(%r11)
-; CHECK-SSE1-NEXT:    movb %bpl, 25(%r11)
-; CHECK-SSE1-NEXT:    movb %r14b, 24(%r11)
-; CHECK-SSE1-NEXT:    movb %r15b, 23(%r11)
-; CHECK-SSE1-NEXT:    movb %r12b, 22(%r11)
-; CHECK-SSE1-NEXT:    movb %r13b, 21(%r11)
+; CHECK-SSE1-NEXT:    movzbl 31(%rdx), %edx
+; CHECK-SSE1-NEXT:    movzbl 31(%rsi), %esi
+; CHECK-SSE1-NEXT:    xorb %dl, %sil
+; CHECK-SSE1-NEXT:    andb 31(%rcx), %sil
+; CHECK-SSE1-NEXT:    xorb %dl, %sil
+; CHECK-SSE1-NEXT:    movb %sil, 31(%r8)
+; CHECK-SSE1-NEXT:    movb %al, 30(%r8)
+; CHECK-SSE1-NEXT:    movb %dil, 29(%r8)
+; CHECK-SSE1-NEXT:    movb %r9b, 28(%r8)
+; CHECK-SSE1-NEXT:    movb %r10b, 27(%r8)
+; CHECK-SSE1-NEXT:    movb %r11b, 26(%r8)
+; CHECK-SSE1-NEXT:    movb %bpl, 25(%r8)
+; CHECK-SSE1-NEXT:    movb %r14b, 24(%r8)
+; CHECK-SSE1-NEXT:    movb %r15b, 23(%r8)
+; CHECK-SSE1-NEXT:    movb %r12b, 22(%r8)
+; CHECK-SSE1-NEXT:    movb %r13b, 21(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 20(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 20(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 19(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 19(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 18(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 18(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 17(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 17(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 16(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 16(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 15(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 15(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 14(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 14(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 13(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 13(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 12(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 12(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 11(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 11(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 10(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 10(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 9(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 9(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 8(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 8(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 7(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 7(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 6(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 6(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 5(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 5(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 4(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 4(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 3(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 3(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 2(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 2(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 1(%r11)
+; CHECK-SSE1-NEXT:    movb %al, 1(%r8)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, (%r11)
-; CHECK-SSE1-NEXT:    movq %r11, %rax
+; CHECK-SSE1-NEXT:    movb %al, (%r8)
+; CHECK-SSE1-NEXT:    movq %r8, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -2492,6 +2480,7 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
@@ -2507,14 +2496,14 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
 ; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    xorb %r10b, %al
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    xorb %r10b, %dil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-BASELINE-NEXT:    xorb %r11b, %sil
@@ -2524,16 +2513,15 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-BASELINE-NEXT:    xorb %bl, %r9b
 ; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
-; CHECK-BASELINE-NEXT:    xorb %r10b, %al
-; CHECK-BASELINE-NEXT:    movb %al, 7(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r13b, 6(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r15b, 5(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdi)
-; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdi)
-; CHECK-BASELINE-NEXT:    movb %dl, 1(%rdi)
-; CHECK-BASELINE-NEXT:    movb %sil, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    xorb %r10b, %dil
+; CHECK-BASELINE-NEXT:    movb %dil, 7(%rax)
+; CHECK-BASELINE-NEXT:    movb %r13b, 6(%rax)
+; CHECK-BASELINE-NEXT:    movb %r15b, 5(%rax)
+; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rax)
+; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rax)
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rax)
+; CHECK-BASELINE-NEXT:    movb %dl, 1(%rax)
+; CHECK-BASELINE-NEXT:    movb %sil, (%rax)
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -2550,6 +2538,7 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
@@ -2565,14 +2554,14 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
 ; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    xorb %r10b, %al
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    xorb %r10b, %dil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r13b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-SSE1-NEXT:    xorb %r11b, %sil
@@ -2582,16 +2571,15 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
 ; CHECK-SSE1-NEXT:    xorb %bl, %r9b
 ; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r15b
 ; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %r13b
-; CHECK-SSE1-NEXT:    xorb %r10b, %al
-; CHECK-SSE1-NEXT:    movb %al, 7(%rdi)
-; CHECK-SSE1-NEXT:    movb %r13b, 6(%rdi)
-; CHECK-SSE1-NEXT:    movb %r15b, 5(%rdi)
-; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
-; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdi)
-; CHECK-SSE1-NEXT:    movb %cl, 2(%rdi)
-; CHECK-SSE1-NEXT:    movb %dl, 1(%rdi)
-; CHECK-SSE1-NEXT:    movb %sil, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    xorb %r10b, %dil
+; CHECK-SSE1-NEXT:    movb %dil, 7(%rax)
+; CHECK-SSE1-NEXT:    movb %r13b, 6(%rax)
+; CHECK-SSE1-NEXT:    movb %r15b, 5(%rax)
+; CHECK-SSE1-NEXT:    movb %r9b, 4(%rax)
+; CHECK-SSE1-NEXT:    movb %r8b, 3(%rax)
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rax)
+; CHECK-SSE1-NEXT:    movb %dl, 1(%rax)
+; CHECK-SSE1-NEXT:    movb %sil, (%rax)
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -2754,11 +2742,7 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
 ; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movq %rdi, %rdx
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
@@ -2767,14 +2751,14 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    xorb %dil, %r9b
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; CHECK-BASELINE-NEXT:    xorb %al, %r9b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
-; CHECK-BASELINE-NEXT:    xorb %dil, %r9b
-; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    xorb %r10b, %dil
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dil
-; CHECK-BASELINE-NEXT:    xorb %r10b, %dil
+; CHECK-BASELINE-NEXT:    xorb %al, %r9b
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
+; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    xorb %r11b, %r10b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
@@ -2804,6 +2788,7 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-BASELINE-NEXT:    xorb %bl, %bpl
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; CHECK-BASELINE-NEXT:    xorb %al, %bl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %bl
 ; CHECK-BASELINE-NEXT:    xorb %al, %bl
@@ -2812,45 +2797,44 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %al
 ; CHECK-BASELINE-NEXT:    xorb %cl, %al
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; CHECK-BASELINE-NEXT:    xorb %sil, %cl
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    xorb %sil, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, 15(%rdx)
-; CHECK-BASELINE-NEXT:    movb %al, 14(%rdx)
-; CHECK-BASELINE-NEXT:    movb %bl, 13(%rdx)
-; CHECK-BASELINE-NEXT:    movb %bpl, 12(%rdx)
-; CHECK-BASELINE-NEXT:    movb %r14b, 11(%rdx)
-; CHECK-BASELINE-NEXT:    movb %r15b, 10(%rdx)
-; CHECK-BASELINE-NEXT:    movb %r12b, 9(%rdx)
-; CHECK-BASELINE-NEXT:    movb %r13b, 8(%rdx)
-; CHECK-BASELINE-NEXT:    movb %r11b, 7(%rdx)
-; CHECK-BASELINE-NEXT:    movb %r10b, 6(%rdx)
-; CHECK-BASELINE-NEXT:    movb %dil, 5(%rdx)
-; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdx)
+; CHECK-BASELINE-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
+; CHECK-BASELINE-NEXT:    movb %cl, 15(%rdi)
+; CHECK-BASELINE-NEXT:    movb %al, 14(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bl, 13(%rdi)
+; CHECK-BASELINE-NEXT:    movb %bpl, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r14b, 11(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r15b, 10(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r12b, 9(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r13b, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r11b, 7(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r10b, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movb %dl, 5(%rdi)
+; CHECK-BASELINE-NEXT:    movb %r9b, 4(%rdi)
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; CHECK-BASELINE-NEXT:    xorb %al, %r8b
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-BASELINE-NEXT:    xorb %al, %r8b
-; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdx)
+; CHECK-BASELINE-NEXT:    movb %r8b, 3(%rdi)
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdx)
+; CHECK-BASELINE-NEXT:    movb %cl, 2(%rdi)
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, 1(%rdx)
+; CHECK-BASELINE-NEXT:    movb %cl, 1(%rdi)
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, (%rdx)
-; CHECK-BASELINE-NEXT:    movq %rdx, %rax
+; CHECK-BASELINE-NEXT:    xorb %al, %sil
+; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-BASELINE-NEXT:    xorb %al, %sil
+; CHECK-BASELINE-NEXT:    movb %sil, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -2869,11 +2853,7 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
 ; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movq %rdi, %rdx
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebp
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
@@ -2882,14 +2862,14 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r13d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    xorb %dil, %r9b
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT:    xorb %al, %r9b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r9b
-; CHECK-SSE1-NEXT:    xorb %dil, %r9b
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    xorb %r10b, %dil
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dil
-; CHECK-SSE1-NEXT:    xorb %r10b, %dil
+; CHECK-SSE1-NEXT:    xorb %al, %r9b
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
+; CHECK-SSE1-NEXT:    xorb %r10b, %dl
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %dl
+; CHECK-SSE1-NEXT:    xorb %r10b, %dl
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    xorb %r11b, %r10b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r10b
@@ -2919,6 +2899,7 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bpl
 ; CHECK-SSE1-NEXT:    xorb %bl, %bpl
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; CHECK-SSE1-NEXT:    xorb %al, %bl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %bl
 ; CHECK-SSE1-NEXT:    xorb %al, %bl
@@ -2927,45 +2908,44 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %al
 ; CHECK-SSE1-NEXT:    xorb %cl, %al
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
-; CHECK-SSE1-NEXT:    xorb %sil, %cl
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    xorb %sil, %cl
-; CHECK-SSE1-NEXT:    movb %cl, 15(%rdx)
-; CHECK-SSE1-NEXT:    movb %al, 14(%rdx)
-; CHECK-SSE1-NEXT:    movb %bl, 13(%rdx)
-; CHECK-SSE1-NEXT:    movb %bpl, 12(%rdx)
-; CHECK-SSE1-NEXT:    movb %r14b, 11(%rdx)
-; CHECK-SSE1-NEXT:    movb %r15b, 10(%rdx)
-; CHECK-SSE1-NEXT:    movb %r12b, 9(%rdx)
-; CHECK-SSE1-NEXT:    movb %r13b, 8(%rdx)
-; CHECK-SSE1-NEXT:    movb %r11b, 7(%rdx)
-; CHECK-SSE1-NEXT:    movb %r10b, 6(%rdx)
-; CHECK-SSE1-NEXT:    movb %dil, 5(%rdx)
-; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdx)
+; CHECK-SSE1-NEXT:    xorb {{[0-9]+}}(%rsp), %cl
+; CHECK-SSE1-NEXT:    movb %cl, 15(%rdi)
+; CHECK-SSE1-NEXT:    movb %al, 14(%rdi)
+; CHECK-SSE1-NEXT:    movb %bl, 13(%rdi)
+; CHECK-SSE1-NEXT:    movb %bpl, 12(%rdi)
+; CHECK-SSE1-NEXT:    movb %r14b, 11(%rdi)
+; CHECK-SSE1-NEXT:    movb %r15b, 10(%rdi)
+; CHECK-SSE1-NEXT:    movb %r12b, 9(%rdi)
+; CHECK-SSE1-NEXT:    movb %r13b, 8(%rdi)
+; CHECK-SSE1-NEXT:    movb %r11b, 7(%rdi)
+; CHECK-SSE1-NEXT:    movb %r10b, 6(%rdi)
+; CHECK-SSE1-NEXT:    movb %dl, 5(%rdi)
+; CHECK-SSE1-NEXT:    movb %r9b, 4(%rdi)
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; CHECK-SSE1-NEXT:    xorb %al, %r8b
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %r8b
 ; CHECK-SSE1-NEXT:    xorb %al, %r8b
-; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdx)
-; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, 2(%rdx)
+; CHECK-SSE1-NEXT:    movb %r8b, 3(%rdi)
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, 1(%rdx)
+; CHECK-SSE1-NEXT:    movb %cl, 2(%rdi)
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, (%rdx)
-; CHECK-SSE1-NEXT:    movq %rdx, %rax
+; CHECK-SSE1-NEXT:    movb %cl, 1(%rdi)
+; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; CHECK-SSE1-NEXT:    xorb %al, %sil
+; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
+; CHECK-SSE1-NEXT:    xorb %al, %sil
+; CHECK-SSE1-NEXT:    movb %sil, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -3231,10 +3211,8 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movq %rcx, %r12
-; CHECK-BASELINE-NEXT:    movq %rdx, %r15
-; CHECK-BASELINE-NEXT:    movq %rsi, %r14
-; CHECK-BASELINE-NEXT:    movq %rdi, %r13
+; CHECK-BASELINE-NEXT:    movq %rcx, %r8
+; CHECK-BASELINE-NEXT:    movq %rdi, %r9
 ; CHECK-BASELINE-NEXT:    movzbl 15(%rdx), %eax
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-BASELINE-NEXT:    movzbl 14(%rdx), %eax
@@ -3247,241 +3225,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-BASELINE-NEXT:    movzbl 10(%rdx), %eax
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 9(%rdx), %r8d
-; CHECK-BASELINE-NEXT:    movzbl 8(%rdx), %r9d
-; CHECK-BASELINE-NEXT:    movzbl 7(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movzbl 9(%rdx), %r14d
+; CHECK-BASELINE-NEXT:    movzbl 8(%rdx), %r15d
+; CHECK-BASELINE-NEXT:    movzbl 7(%rdx), %r12d
 ; CHECK-BASELINE-NEXT:    movzbl 6(%rdx), %ebp
-; CHECK-BASELINE-NEXT:    movzbl 5(%rdx), %edi
-; CHECK-BASELINE-NEXT:    movzbl 4(%rdx), %esi
-; CHECK-BASELINE-NEXT:    movzbl 3(%rdx), %eax
-; CHECK-BASELINE-NEXT:    movzbl 2(%rdx), %ecx
-; CHECK-BASELINE-NEXT:    movzbl (%rdx), %r11d
-; CHECK-BASELINE-NEXT:    movzbl 1(%rdx), %edx
-; CHECK-BASELINE-NEXT:    movzbl (%r14), %ebx
-; CHECK-BASELINE-NEXT:    xorb %r11b, %bl
-; CHECK-BASELINE-NEXT:    andb (%r12), %bl
-; CHECK-BASELINE-NEXT:    xorb %r11b, %bl
-; CHECK-BASELINE-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 1(%r14), %r11d
-; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
-; CHECK-BASELINE-NEXT:    andb 1(%r12), %r11b
-; CHECK-BASELINE-NEXT:    xorb %dl, %r11b
-; CHECK-BASELINE-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 2(%r14), %edx
-; CHECK-BASELINE-NEXT:    xorb %cl, %dl
-; CHECK-BASELINE-NEXT:    andb 2(%r12), %dl
-; CHECK-BASELINE-NEXT:    xorb %cl, %dl
-; CHECK-BASELINE-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 3(%r14), %ecx
+; CHECK-BASELINE-NEXT:    movzbl 5(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movzbl 4(%rdx), %r11d
+; CHECK-BASELINE-NEXT:    movzbl 3(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 2(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movzbl (%rdx), %r13d
+; CHECK-BASELINE-NEXT:    movzbl 1(%rdx), %ecx
+; CHECK-BASELINE-NEXT:    movzbl (%rsi), %r10d
+; CHECK-BASELINE-NEXT:    xorb %r13b, %r10b
+; CHECK-BASELINE-NEXT:    andb (%r8), %r10b
+; CHECK-BASELINE-NEXT:    xorb %r13b, %r10b
+; CHECK-BASELINE-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 1(%rsi), %r10d
+; CHECK-BASELINE-NEXT:    xorb %cl, %r10b
+; CHECK-BASELINE-NEXT:    andb 1(%r8), %r10b
+; CHECK-BASELINE-NEXT:    xorb %cl, %r10b
+; CHECK-BASELINE-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 2(%rsi), %ecx
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 3(%r12), %cl
+; CHECK-BASELINE-NEXT:    andb 2(%r8), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 4(%r14), %eax
-; CHECK-BASELINE-NEXT:    xorb %sil, %al
-; CHECK-BASELINE-NEXT:    andb 4(%r12), %al
-; CHECK-BASELINE-NEXT:    xorb %sil, %al
-; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 5(%r14), %eax
+; CHECK-BASELINE-NEXT:    movzbl 3(%rsi), %eax
 ; CHECK-BASELINE-NEXT:    xorb %dil, %al
-; CHECK-BASELINE-NEXT:    andb 5(%r12), %al
+; CHECK-BASELINE-NEXT:    andb 3(%r8), %al
 ; CHECK-BASELINE-NEXT:    xorb %dil, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 6(%r14), %eax
+; CHECK-BASELINE-NEXT:    movzbl 4(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %r11b, %al
+; CHECK-BASELINE-NEXT:    andb 4(%r8), %al
+; CHECK-BASELINE-NEXT:    xorb %r11b, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 5(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %bl, %al
+; CHECK-BASELINE-NEXT:    andb 5(%r8), %al
+; CHECK-BASELINE-NEXT:    xorb %bl, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 6(%rsi), %eax
 ; CHECK-BASELINE-NEXT:    xorb %bpl, %al
-; CHECK-BASELINE-NEXT:    andb 6(%r12), %al
+; CHECK-BASELINE-NEXT:    andb 6(%r8), %al
 ; CHECK-BASELINE-NEXT:    xorb %bpl, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 7(%r14), %eax
-; CHECK-BASELINE-NEXT:    xorb %r10b, %al
-; CHECK-BASELINE-NEXT:    andb 7(%r12), %al
-; CHECK-BASELINE-NEXT:    xorb %r10b, %al
+; CHECK-BASELINE-NEXT:    movzbl 7(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %r12b, %al
+; CHECK-BASELINE-NEXT:    andb 7(%r8), %al
+; CHECK-BASELINE-NEXT:    xorb %r12b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 8(%r14), %eax
-; CHECK-BASELINE-NEXT:    xorb %r9b, %al
-; CHECK-BASELINE-NEXT:    andb 8(%r12), %al
-; CHECK-BASELINE-NEXT:    xorb %r9b, %al
+; CHECK-BASELINE-NEXT:    movzbl 8(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %r15b, %al
+; CHECK-BASELINE-NEXT:    andb 8(%r8), %al
+; CHECK-BASELINE-NEXT:    xorb %r15b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 9(%r14), %eax
-; CHECK-BASELINE-NEXT:    xorb %r8b, %al
-; CHECK-BASELINE-NEXT:    andb 9(%r12), %al
-; CHECK-BASELINE-NEXT:    xorb %r8b, %al
+; CHECK-BASELINE-NEXT:    movzbl 9(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %r14b, %al
+; CHECK-BASELINE-NEXT:    andb 9(%r8), %al
+; CHECK-BASELINE-NEXT:    xorb %r14b, %al
 ; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 10(%r14), %ecx
+; CHECK-BASELINE-NEXT:    movzbl 10(%rsi), %ecx
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 10(%r12), %cl
+; CHECK-BASELINE-NEXT:    andb 10(%r8), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 11(%r14), %ecx
+; CHECK-BASELINE-NEXT:    movzbl 11(%rsi), %ecx
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 11(%r12), %cl
+; CHECK-BASELINE-NEXT:    andb 11(%r8), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 12(%r14), %ecx
+; CHECK-BASELINE-NEXT:    movzbl 12(%rsi), %ecx
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 12(%r12), %cl
+; CHECK-BASELINE-NEXT:    andb 12(%r8), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 13(%r14), %ecx
+; CHECK-BASELINE-NEXT:    movzbl 13(%rsi), %ecx
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 13(%r12), %cl
+; CHECK-BASELINE-NEXT:    andb 13(%r8), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 14(%r14), %ecx
+; CHECK-BASELINE-NEXT:    movzbl 14(%rsi), %ecx
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 14(%r12), %cl
+; CHECK-BASELINE-NEXT:    andb 14(%r8), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 15(%r14), %ecx
+; CHECK-BASELINE-NEXT:    movzbl 15(%rsi), %ecx
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 15(%r12), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 16(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 16(%r14), %ecx
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 16(%r12), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 17(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 17(%r14), %ecx
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 17(%r12), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 18(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 18(%r14), %ecx
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 18(%r12), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 19(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 19(%r14), %ecx
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 19(%r12), %cl
+; CHECK-BASELINE-NEXT:    andb 15(%r8), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
 ; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 20(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 20(%r14), %ecx
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 20(%r12), %cl
-; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT:    movzbl 21(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 21(%r14), %ebp
-; CHECK-BASELINE-NEXT:    xorb %al, %bpl
-; CHECK-BASELINE-NEXT:    andb 21(%r12), %bpl
-; CHECK-BASELINE-NEXT:    xorb %al, %bpl
-; CHECK-BASELINE-NEXT:    movzbl 22(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 22(%r14), %ebx
-; CHECK-BASELINE-NEXT:    xorb %al, %bl
-; CHECK-BASELINE-NEXT:    andb 22(%r12), %bl
-; CHECK-BASELINE-NEXT:    xorb %al, %bl
-; CHECK-BASELINE-NEXT:    movzbl 23(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 23(%r14), %r11d
-; CHECK-BASELINE-NEXT:    xorb %al, %r11b
-; CHECK-BASELINE-NEXT:    andb 23(%r12), %r11b
-; CHECK-BASELINE-NEXT:    xorb %al, %r11b
-; CHECK-BASELINE-NEXT:    movzbl 24(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 24(%r14), %r9d
-; CHECK-BASELINE-NEXT:    xorb %al, %r9b
-; CHECK-BASELINE-NEXT:    andb 24(%r12), %r9b
-; CHECK-BASELINE-NEXT:    xorb %al, %r9b
-; CHECK-BASELINE-NEXT:    movzbl 25(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 25(%r14), %r8d
-; CHECK-BASELINE-NEXT:    xorb %al, %r8b
-; CHECK-BASELINE-NEXT:    andb 25(%r12), %r8b
-; CHECK-BASELINE-NEXT:    xorb %al, %r8b
-; CHECK-BASELINE-NEXT:    movzbl 26(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 26(%r14), %edi
+; CHECK-BASELINE-NEXT:    movzbl 16(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 16(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 16(%r8), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 17(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 17(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 17(%r8), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 18(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 18(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 18(%r8), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 19(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 19(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 19(%r8), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 20(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 20(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    andb 20(%r8), %al
+; CHECK-BASELINE-NEXT:    xorb %dil, %al
+; CHECK-BASELINE-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT:    movzbl 21(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 21(%rsi), %r13d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r13b
+; CHECK-BASELINE-NEXT:    andb 21(%r8), %r13b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r13b
+; CHECK-BASELINE-NEXT:    movzbl 22(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 22(%rsi), %r12d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r12b
+; CHECK-BASELINE-NEXT:    andb 22(%r8), %r12b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r12b
+; CHECK-BASELINE-NEXT:    movzbl 23(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 23(%rsi), %r15d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r15b
+; CHECK-BASELINE-NEXT:    andb 23(%r8), %r15b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r15b
+; CHECK-BASELINE-NEXT:    movzbl 24(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 24(%rsi), %ebp
+; CHECK-BASELINE-NEXT:    xorb %dil, %bpl
+; CHECK-BASELINE-NEXT:    andb 24(%r8), %bpl
+; CHECK-BASELINE-NEXT:    xorb %dil, %bpl
+; CHECK-BASELINE-NEXT:    movzbl 25(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 25(%rsi), %ebx
+; CHECK-BASELINE-NEXT:    xorb %dil, %bl
+; CHECK-BASELINE-NEXT:    andb 25(%r8), %bl
+; CHECK-BASELINE-NEXT:    xorb %dil, %bl
+; CHECK-BASELINE-NEXT:    movzbl 26(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 26(%rsi), %r11d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r11b
+; CHECK-BASELINE-NEXT:    andb 26(%r8), %r11b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r11b
+; CHECK-BASELINE-NEXT:    movzbl 27(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movzbl 27(%rsi), %r10d
+; CHECK-BASELINE-NEXT:    xorb %dil, %r10b
+; CHECK-BASELINE-NEXT:    andb 27(%r8), %r10b
+; CHECK-BASELINE-NEXT:    xorb %dil, %r10b
+; CHECK-BASELINE-NEXT:    movzbl 28(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movzbl 28(%rsi), %edi
 ; CHECK-BASELINE-NEXT:    xorb %al, %dil
-; CHECK-BASELINE-NEXT:    andb 26(%r12), %dil
+; CHECK-BASELINE-NEXT:    andb 28(%r8), %dil
 ; CHECK-BASELINE-NEXT:    xorb %al, %dil
-; CHECK-BASELINE-NEXT:    movzbl 27(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 27(%r14), %esi
-; CHECK-BASELINE-NEXT:    xorb %al, %sil
-; CHECK-BASELINE-NEXT:    andb 27(%r12), %sil
-; CHECK-BASELINE-NEXT:    xorb %al, %sil
-; CHECK-BASELINE-NEXT:    movzbl 28(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 28(%r14), %edx
-; CHECK-BASELINE-NEXT:    xorb %al, %dl
-; CHECK-BASELINE-NEXT:    andb 28(%r12), %dl
-; CHECK-BASELINE-NEXT:    xorb %al, %dl
-; CHECK-BASELINE-NEXT:    movzbl 29(%r15), %eax
-; CHECK-BASELINE-NEXT:    movzbl 29(%r14), %ecx
+; CHECK-BASELINE-NEXT:    movzbl 29(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movzbl 29(%rsi), %ecx
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    andb 29(%r12), %cl
+; CHECK-BASELINE-NEXT:    andb 29(%r8), %cl
 ; CHECK-BASELINE-NEXT:    xorb %al, %cl
-; CHECK-BASELINE-NEXT:    movzbl 30(%r15), %r10d
-; CHECK-BASELINE-NEXT:    movzbl 30(%r14), %eax
-; CHECK-BASELINE-NEXT:    xorb %r10b, %al
-; CHECK-BASELINE-NEXT:    andb 30(%r12), %al
-; CHECK-BASELINE-NEXT:    xorb %r10b, %al
-; CHECK-BASELINE-NEXT:    movzbl 31(%r15), %r10d
-; CHECK-BASELINE-NEXT:    movzbl 31(%r14), %r14d
-; CHECK-BASELINE-NEXT:    xorb %r10b, %r14b
-; CHECK-BASELINE-NEXT:    andb 31(%r12), %r14b
-; CHECK-BASELINE-NEXT:    xorb %r10b, %r14b
-; CHECK-BASELINE-NEXT:    movb %r14b, 31(%r13)
-; CHECK-BASELINE-NEXT:    movb %al, 30(%r13)
-; CHECK-BASELINE-NEXT:    movb %cl, 29(%r13)
-; CHECK-BASELINE-NEXT:    movb %dl, 28(%r13)
-; CHECK-BASELINE-NEXT:    movb %sil, 27(%r13)
-; CHECK-BASELINE-NEXT:    movb %dil, 26(%r13)
-; CHECK-BASELINE-NEXT:    movb %r8b, 25(%r13)
-; CHECK-BASELINE-NEXT:    movb %r9b, 24(%r13)
-; CHECK-BASELINE-NEXT:    movb %r11b, 23(%r13)
-; CHECK-BASELINE-NEXT:    movb %bl, 22(%r13)
-; CHECK-BASELINE-NEXT:    movb %bpl, 21(%r13)
+; CHECK-BASELINE-NEXT:    movzbl 30(%rdx), %r14d
+; CHECK-BASELINE-NEXT:    movzbl 30(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorb %r14b, %al
+; CHECK-BASELINE-NEXT:    andb 30(%r8), %al
+; CHECK-BASELINE-NEXT:    xorb %r14b, %al
+; CHECK-BASELINE-NEXT:    movzbl 31(%rdx), %edx
+; CHECK-BASELINE-NEXT:    movzbl 31(%rsi), %esi
+; CHECK-BASELINE-NEXT:    xorb %dl, %sil
+; CHECK-BASELINE-NEXT:    andb 31(%r8), %sil
+; CHECK-BASELINE-NEXT:    xorb %dl, %sil
+; CHECK-BASELINE-NEXT:    movb %sil, 31(%r9)
+; CHECK-BASELINE-NEXT:    movb %al, 30(%r9)
+; CHECK-BASELINE-NEXT:    movb %cl, 29(%r9)
+; CHECK-BASELINE-NEXT:    movb %dil, 28(%r9)
+; CHECK-BASELINE-NEXT:    movb %r10b, 27(%r9)
+; CHECK-BASELINE-NEXT:    movb %r11b, 26(%r9)
+; CHECK-BASELINE-NEXT:    movb %bl, 25(%r9)
+; CHECK-BASELINE-NEXT:    movb %bpl, 24(%r9)
+; CHECK-BASELINE-NEXT:    movb %r15b, 23(%r9)
+; CHECK-BASELINE-NEXT:    movb %r12b, 22(%r9)
+; CHECK-BASELINE-NEXT:    movb %r13b, 21(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 20(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 20(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 19(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 19(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 18(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 18(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 17(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 17(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 16(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 16(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 15(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 15(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 14(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 14(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 13(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 13(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 12(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 12(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 11(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 11(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 10(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 10(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 9(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 9(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 8(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 8(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 7(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 7(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 6(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 6(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 5(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 5(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 4(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 4(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 3(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 3(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 2(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 2(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, 1(%r13)
+; CHECK-BASELINE-NEXT:    movb %al, 1(%r9)
 ; CHECK-BASELINE-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movb %al, (%r13)
-; CHECK-BASELINE-NEXT:    movq %r13, %rax
+; CHECK-BASELINE-NEXT:    movb %al, (%r9)
+; CHECK-BASELINE-NEXT:    movq %r9, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -3498,10 +3476,8 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movq %rcx, %r12
-; CHECK-SSE1-NEXT:    movq %rdx, %r15
-; CHECK-SSE1-NEXT:    movq %rsi, %r14
-; CHECK-SSE1-NEXT:    movq %rdi, %r13
+; CHECK-SSE1-NEXT:    movq %rcx, %r8
+; CHECK-SSE1-NEXT:    movq %rdi, %r9
 ; CHECK-SSE1-NEXT:    movzbl 15(%rdx), %eax
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT:    movzbl 14(%rdx), %eax
@@ -3514,241 +3490,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; CHECK-SSE1-NEXT:    movzbl 10(%rdx), %eax
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 9(%rdx), %r8d
-; CHECK-SSE1-NEXT:    movzbl 8(%rdx), %r9d
-; CHECK-SSE1-NEXT:    movzbl 7(%rdx), %r10d
+; CHECK-SSE1-NEXT:    movzbl 9(%rdx), %r14d
+; CHECK-SSE1-NEXT:    movzbl 8(%rdx), %r15d
+; CHECK-SSE1-NEXT:    movzbl 7(%rdx), %r12d
 ; CHECK-SSE1-NEXT:    movzbl 6(%rdx), %ebp
-; CHECK-SSE1-NEXT:    movzbl 5(%rdx), %edi
-; CHECK-SSE1-NEXT:    movzbl 4(%rdx), %esi
-; CHECK-SSE1-NEXT:    movzbl 3(%rdx), %eax
-; CHECK-SSE1-NEXT:    movzbl 2(%rdx), %ecx
-; CHECK-SSE1-NEXT:    movzbl (%rdx), %r11d
-; CHECK-SSE1-NEXT:    movzbl 1(%rdx), %edx
-; CHECK-SSE1-NEXT:    movzbl (%r14), %ebx
-; CHECK-SSE1-NEXT:    xorb %r11b, %bl
-; CHECK-SSE1-NEXT:    andb (%r12), %bl
-; CHECK-SSE1-NEXT:    xorb %r11b, %bl
-; CHECK-SSE1-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 1(%r14), %r11d
-; CHECK-SSE1-NEXT:    xorb %dl, %r11b
-; CHECK-SSE1-NEXT:    andb 1(%r12), %r11b
-; CHECK-SSE1-NEXT:    xorb %dl, %r11b
-; CHECK-SSE1-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 2(%r14), %edx
-; CHECK-SSE1-NEXT:    xorb %cl, %dl
-; CHECK-SSE1-NEXT:    andb 2(%r12), %dl
-; CHECK-SSE1-NEXT:    xorb %cl, %dl
-; CHECK-SSE1-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 3(%r14), %ecx
+; CHECK-SSE1-NEXT:    movzbl 5(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movzbl 4(%rdx), %r11d
+; CHECK-SSE1-NEXT:    movzbl 3(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 2(%rdx), %eax
+; CHECK-SSE1-NEXT:    movzbl (%rdx), %r13d
+; CHECK-SSE1-NEXT:    movzbl 1(%rdx), %ecx
+; CHECK-SSE1-NEXT:    movzbl (%rsi), %r10d
+; CHECK-SSE1-NEXT:    xorb %r13b, %r10b
+; CHECK-SSE1-NEXT:    andb (%r8), %r10b
+; CHECK-SSE1-NEXT:    xorb %r13b, %r10b
+; CHECK-SSE1-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 1(%rsi), %r10d
+; CHECK-SSE1-NEXT:    xorb %cl, %r10b
+; CHECK-SSE1-NEXT:    andb 1(%r8), %r10b
+; CHECK-SSE1-NEXT:    xorb %cl, %r10b
+; CHECK-SSE1-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 2(%rsi), %ecx
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 3(%r12), %cl
+; CHECK-SSE1-NEXT:    andb 2(%r8), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 4(%r14), %eax
-; CHECK-SSE1-NEXT:    xorb %sil, %al
-; CHECK-SSE1-NEXT:    andb 4(%r12), %al
-; CHECK-SSE1-NEXT:    xorb %sil, %al
-; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 5(%r14), %eax
+; CHECK-SSE1-NEXT:    movzbl 3(%rsi), %eax
 ; CHECK-SSE1-NEXT:    xorb %dil, %al
-; CHECK-SSE1-NEXT:    andb 5(%r12), %al
+; CHECK-SSE1-NEXT:    andb 3(%r8), %al
 ; CHECK-SSE1-NEXT:    xorb %dil, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 6(%r14), %eax
+; CHECK-SSE1-NEXT:    movzbl 4(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %r11b, %al
+; CHECK-SSE1-NEXT:    andb 4(%r8), %al
+; CHECK-SSE1-NEXT:    xorb %r11b, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 5(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %bl, %al
+; CHECK-SSE1-NEXT:    andb 5(%r8), %al
+; CHECK-SSE1-NEXT:    xorb %bl, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 6(%rsi), %eax
 ; CHECK-SSE1-NEXT:    xorb %bpl, %al
-; CHECK-SSE1-NEXT:    andb 6(%r12), %al
+; CHECK-SSE1-NEXT:    andb 6(%r8), %al
 ; CHECK-SSE1-NEXT:    xorb %bpl, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 7(%r14), %eax
-; CHECK-SSE1-NEXT:    xorb %r10b, %al
-; CHECK-SSE1-NEXT:    andb 7(%r12), %al
-; CHECK-SSE1-NEXT:    xorb %r10b, %al
+; CHECK-SSE1-NEXT:    movzbl 7(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %r12b, %al
+; CHECK-SSE1-NEXT:    andb 7(%r8), %al
+; CHECK-SSE1-NEXT:    xorb %r12b, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 8(%r14), %eax
-; CHECK-SSE1-NEXT:    xorb %r9b, %al
-; CHECK-SSE1-NEXT:    andb 8(%r12), %al
-; CHECK-SSE1-NEXT:    xorb %r9b, %al
+; CHECK-SSE1-NEXT:    movzbl 8(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %r15b, %al
+; CHECK-SSE1-NEXT:    andb 8(%r8), %al
+; CHECK-SSE1-NEXT:    xorb %r15b, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 9(%r14), %eax
-; CHECK-SSE1-NEXT:    xorb %r8b, %al
-; CHECK-SSE1-NEXT:    andb 9(%r12), %al
-; CHECK-SSE1-NEXT:    xorb %r8b, %al
+; CHECK-SSE1-NEXT:    movzbl 9(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %r14b, %al
+; CHECK-SSE1-NEXT:    andb 9(%r8), %al
+; CHECK-SSE1-NEXT:    xorb %r14b, %al
 ; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 10(%r14), %ecx
+; CHECK-SSE1-NEXT:    movzbl 10(%rsi), %ecx
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 10(%r12), %cl
+; CHECK-SSE1-NEXT:    andb 10(%r8), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 11(%r14), %ecx
+; CHECK-SSE1-NEXT:    movzbl 11(%rsi), %ecx
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 11(%r12), %cl
+; CHECK-SSE1-NEXT:    andb 11(%r8), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 12(%r14), %ecx
+; CHECK-SSE1-NEXT:    movzbl 12(%rsi), %ecx
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 12(%r12), %cl
+; CHECK-SSE1-NEXT:    andb 12(%r8), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 13(%r14), %ecx
+; CHECK-SSE1-NEXT:    movzbl 13(%rsi), %ecx
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 13(%r12), %cl
+; CHECK-SSE1-NEXT:    andb 13(%r8), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 14(%r14), %ecx
+; CHECK-SSE1-NEXT:    movzbl 14(%rsi), %ecx
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 14(%r12), %cl
+; CHECK-SSE1-NEXT:    andb 14(%r8), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 15(%r14), %ecx
+; CHECK-SSE1-NEXT:    movzbl 15(%rsi), %ecx
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 15(%r12), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 16(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 16(%r14), %ecx
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 16(%r12), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 17(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 17(%r14), %ecx
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 17(%r12), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 18(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 18(%r14), %ecx
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 18(%r12), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 19(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 19(%r14), %ecx
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 19(%r12), %cl
+; CHECK-SSE1-NEXT:    andb 15(%r8), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
 ; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 20(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 20(%r14), %ecx
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 20(%r12), %cl
-; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT:    movzbl 21(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 21(%r14), %ebp
-; CHECK-SSE1-NEXT:    xorb %al, %bpl
-; CHECK-SSE1-NEXT:    andb 21(%r12), %bpl
-; CHECK-SSE1-NEXT:    xorb %al, %bpl
-; CHECK-SSE1-NEXT:    movzbl 22(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 22(%r14), %ebx
-; CHECK-SSE1-NEXT:    xorb %al, %bl
-; CHECK-SSE1-NEXT:    andb 22(%r12), %bl
-; CHECK-SSE1-NEXT:    xorb %al, %bl
-; CHECK-SSE1-NEXT:    movzbl 23(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 23(%r14), %r11d
-; CHECK-SSE1-NEXT:    xorb %al, %r11b
-; CHECK-SSE1-NEXT:    andb 23(%r12), %r11b
-; CHECK-SSE1-NEXT:    xorb %al, %r11b
-; CHECK-SSE1-NEXT:    movzbl 24(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 24(%r14), %r9d
-; CHECK-SSE1-NEXT:    xorb %al, %r9b
-; CHECK-SSE1-NEXT:    andb 24(%r12), %r9b
-; CHECK-SSE1-NEXT:    xorb %al, %r9b
-; CHECK-SSE1-NEXT:    movzbl 25(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 25(%r14), %r8d
-; CHECK-SSE1-NEXT:    xorb %al, %r8b
-; CHECK-SSE1-NEXT:    andb 25(%r12), %r8b
-; CHECK-SSE1-NEXT:    xorb %al, %r8b
-; CHECK-SSE1-NEXT:    movzbl 26(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 26(%r14), %edi
+; CHECK-SSE1-NEXT:    movzbl 16(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 16(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 16(%r8), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 17(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 17(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 17(%r8), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 18(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 18(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 18(%r8), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 19(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 19(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 19(%r8), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 20(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 20(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    andb 20(%r8), %al
+; CHECK-SSE1-NEXT:    xorb %dil, %al
+; CHECK-SSE1-NEXT:    movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT:    movzbl 21(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 21(%rsi), %r13d
+; CHECK-SSE1-NEXT:    xorb %dil, %r13b
+; CHECK-SSE1-NEXT:    andb 21(%r8), %r13b
+; CHECK-SSE1-NEXT:    xorb %dil, %r13b
+; CHECK-SSE1-NEXT:    movzbl 22(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 22(%rsi), %r12d
+; CHECK-SSE1-NEXT:    xorb %dil, %r12b
+; CHECK-SSE1-NEXT:    andb 22(%r8), %r12b
+; CHECK-SSE1-NEXT:    xorb %dil, %r12b
+; CHECK-SSE1-NEXT:    movzbl 23(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 23(%rsi), %r15d
+; CHECK-SSE1-NEXT:    xorb %dil, %r15b
+; CHECK-SSE1-NEXT:    andb 23(%r8), %r15b
+; CHECK-SSE1-NEXT:    xorb %dil, %r15b
+; CHECK-SSE1-NEXT:    movzbl 24(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 24(%rsi), %ebp
+; CHECK-SSE1-NEXT:    xorb %dil, %bpl
+; CHECK-SSE1-NEXT:    andb 24(%r8), %bpl
+; CHECK-SSE1-NEXT:    xorb %dil, %bpl
+; CHECK-SSE1-NEXT:    movzbl 25(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 25(%rsi), %ebx
+; CHECK-SSE1-NEXT:    xorb %dil, %bl
+; CHECK-SSE1-NEXT:    andb 25(%r8), %bl
+; CHECK-SSE1-NEXT:    xorb %dil, %bl
+; CHECK-SSE1-NEXT:    movzbl 26(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 26(%rsi), %r11d
+; CHECK-SSE1-NEXT:    xorb %dil, %r11b
+; CHECK-SSE1-NEXT:    andb 26(%r8), %r11b
+; CHECK-SSE1-NEXT:    xorb %dil, %r11b
+; CHECK-SSE1-NEXT:    movzbl 27(%rdx), %edi
+; CHECK-SSE1-NEXT:    movzbl 27(%rsi), %r10d
+; CHECK-SSE1-NEXT:    xorb %dil, %r10b
+; CHECK-SSE1-NEXT:    andb 27(%r8), %r10b
+; CHECK-SSE1-NEXT:    xorb %dil, %r10b
+; CHECK-SSE1-NEXT:    movzbl 28(%rdx), %eax
+; CHECK-SSE1-NEXT:    movzbl 28(%rsi), %edi
 ; CHECK-SSE1-NEXT:    xorb %al, %dil
-; CHECK-SSE1-NEXT:    andb 26(%r12), %dil
+; CHECK-SSE1-NEXT:    andb 28(%r8), %dil
 ; CHECK-SSE1-NEXT:    xorb %al, %dil
-; CHECK-SSE1-NEXT:    movzbl 27(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 27(%r14), %esi
-; CHECK-SSE1-NEXT:    xorb %al, %sil
-; CHECK-SSE1-NEXT:    andb 27(%r12), %sil
-; CHECK-SSE1-NEXT:    xorb %al, %sil
-; CHECK-SSE1-NEXT:    movzbl 28(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 28(%r14), %edx
-; CHECK-SSE1-NEXT:    xorb %al, %dl
-; CHECK-SSE1-NEXT:    andb 28(%r12), %dl
-; CHECK-SSE1-NEXT:    xorb %al, %dl
-; CHECK-SSE1-NEXT:    movzbl 29(%r15), %eax
-; CHECK-SSE1-NEXT:    movzbl 29(%r14), %ecx
+; CHECK-SSE1-NEXT:    movzbl 29(%rdx), %eax
+; CHECK-SSE1-NEXT:    movzbl 29(%rsi), %ecx
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    andb 29(%r12), %cl
+; CHECK-SSE1-NEXT:    andb 29(%r8), %cl
 ; CHECK-SSE1-NEXT:    xorb %al, %cl
-; CHECK-SSE1-NEXT:    movzbl 30(%r15), %r10d
-; CHECK-SSE1-NEXT:    movzbl 30(%r14), %eax
-; CHECK-SSE1-NEXT:    xorb %r10b, %al
-; CHECK-SSE1-NEXT:    andb 30(%r12), %al
-; CHECK-SSE1-NEXT:    xorb %r10b, %al
-; CHECK-SSE1-NEXT:    movzbl 31(%r15), %r10d
-; CHECK-SSE1-NEXT:    movzbl 31(%r14), %r14d
-; CHECK-SSE1-NEXT:    xorb %r10b, %r14b
-; CHECK-SSE1-NEXT:    andb 31(%r12), %r14b
-; CHECK-SSE1-NEXT:    xorb %r10b, %r14b
-; CHECK-SSE1-NEXT:    movb %r14b, 31(%r13)
-; CHECK-SSE1-NEXT:    movb %al, 30(%r13)
-; CHECK-SSE1-NEXT:    movb %cl, 29(%r13)
-; CHECK-SSE1-NEXT:    movb %dl, 28(%r13)
-; CHECK-SSE1-NEXT:    movb %sil, 27(%r13)
-; CHECK-SSE1-NEXT:    movb %dil, 26(%r13)
-; CHECK-SSE1-NEXT:    movb %r8b, 25(%r13)
-; CHECK-SSE1-NEXT:    movb %r9b, 24(%r13)
-; CHECK-SSE1-NEXT:    movb %r11b, 23(%r13)
-; CHECK-SSE1-NEXT:    movb %bl, 22(%r13)
-; CHECK-SSE1-NEXT:    movb %bpl, 21(%r13)
+; CHECK-SSE1-NEXT:    movzbl 30(%rdx), %r14d
+; CHECK-SSE1-NEXT:    movzbl 30(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorb %r14b, %al
+; CHECK-SSE1-NEXT:    andb 30(%r8), %al
+; CHECK-SSE1-NEXT:    xorb %r14b, %al
+; CHECK-SSE1-NEXT:    movzbl 31(%rdx), %edx
+; CHECK-SSE1-NEXT:    movzbl 31(%rsi), %esi
+; CHECK-SSE1-NEXT:    xorb %dl, %sil
+; CHECK-SSE1-NEXT:    andb 31(%r8), %sil
+; CHECK-SSE1-NEXT:    xorb %dl, %sil
+; CHECK-SSE1-NEXT:    movb %sil, 31(%r9)
+; CHECK-SSE1-NEXT:    movb %al, 30(%r9)
+; CHECK-SSE1-NEXT:    movb %cl, 29(%r9)
+; CHECK-SSE1-NEXT:    movb %dil, 28(%r9)
+; CHECK-SSE1-NEXT:    movb %r10b, 27(%r9)
+; CHECK-SSE1-NEXT:    movb %r11b, 26(%r9)
+; CHECK-SSE1-NEXT:    movb %bl, 25(%r9)
+; CHECK-SSE1-NEXT:    movb %bpl, 24(%r9)
+; CHECK-SSE1-NEXT:    movb %r15b, 23(%r9)
+; CHECK-SSE1-NEXT:    movb %r12b, 22(%r9)
+; CHECK-SSE1-NEXT:    movb %r13b, 21(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 20(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 20(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 19(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 19(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 18(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 18(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 17(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 17(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 16(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 16(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 15(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 15(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 14(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 14(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 13(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 13(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 12(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 12(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 11(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 11(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 10(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 10(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 9(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 9(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 8(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 8(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 7(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 7(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 6(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 6(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 5(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 5(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 4(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 4(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 3(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 3(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 2(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 2(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, 1(%r13)
+; CHECK-SSE1-NEXT:    movb %al, 1(%r9)
 ; CHECK-SSE1-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT:    movb %al, (%r13)
-; CHECK-SSE1-NEXT:    movq %r13, %rax
+; CHECK-SSE1-NEXT:    movb %al, (%r9)
+; CHECK-SSE1-NEXT:    movq %r9, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -3795,20 +3771,20 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movq %rcx, %r9
-; CHECK-BASELINE-NEXT:    movq %rdi, %r10
+; CHECK-BASELINE-NEXT:    movq %rcx, %rax
+; CHECK-BASELINE-NEXT:    movq %rdi, %r8
 ; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %edi
 ; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movl 28(%rdx), %edi
 ; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %edi
 ; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl 24(%rdx), %eax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %eax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r8d
-; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 24(%rdx), %ecx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %ecx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r11d
 ; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movl 16(%rdx), %ebx
@@ -3825,82 +3801,77 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movl (%rdx), %ecx
 ; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl 4(%rdx), %edi
+; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %edi
 ; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %eax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movzwl (%rsi), %edx
 ; CHECK-BASELINE-NEXT:    xorw %cx, %dx
 ; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %ecx
-; CHECK-BASELINE-NEXT:    xorw %ax, %cx
+; CHECK-BASELINE-NEXT:    xorw %di, %cx
 ; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %di, %ax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %ecx
-; CHECK-BASELINE-NEXT:    xorw %r13w, %cx
-; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r12w, %ax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r15w, %ax
-; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %edx
-; CHECK-BASELINE-NEXT:    xorw %r14w, %dx
-; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %r13d
-; CHECK-BASELINE-NEXT:    xorw %bp, %r13w
-; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %r12d
-; CHECK-BASELINE-NEXT:    xorw %bx, %r12w
-; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %r15d
-; CHECK-BASELINE-NEXT:    xorw %r11w, %r15w
-; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %r14d
-; CHECK-BASELINE-NEXT:    xorw %r8w, %r14w
-; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %ebp
-; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %ebx
-; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r11d
-; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %edi
+; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %ecx
+; CHECK-BASELINE-NEXT:    xorw %r9w, %cx
+; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorw %r13w, %dx
+; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorw %r12w, %dx
+; CHECK-BASELINE-NEXT:    movl %edx, %r13d
+; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %r12d
+; CHECK-BASELINE-NEXT:    xorw %r15w, %r12w
+; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %r15d
+; CHECK-BASELINE-NEXT:    xorw %r14w, %r15w
+; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %r14d
+; CHECK-BASELINE-NEXT:    xorw %bp, %r14w
+; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %ebp
+; CHECK-BASELINE-NEXT:    xorw %bx, %bp
+; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %ebx
+; CHECK-BASELINE-NEXT:    xorw %r11w, %bx
+; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %r11d
+; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
+; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %r10d
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r10w # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %r9d
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r9w # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %edi
 ; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %edx
+; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %esi
 ; CHECK-BASELINE-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload
-; CHECK-BASELINE-NEXT:    andw 30(%r9), %si
-; CHECK-BASELINE-NEXT:    andw 28(%r9), %di
-; CHECK-BASELINE-NEXT:    andw 26(%r9), %r11w
-; CHECK-BASELINE-NEXT:    andw 24(%r9), %bx
-; CHECK-BASELINE-NEXT:    andw 22(%r9), %bp
-; CHECK-BASELINE-NEXT:    andw 20(%r9), %r14w
-; CHECK-BASELINE-NEXT:    andw 18(%r9), %r15w
-; CHECK-BASELINE-NEXT:    andw 16(%r9), %r12w
-; CHECK-BASELINE-NEXT:    andw 14(%r9), %r13w
-; CHECK-BASELINE-NEXT:    andw 12(%r9), %dx
-; CHECK-BASELINE-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    andw 10(%r9), %ax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andw 8(%r9), %dx
-; CHECK-BASELINE-NEXT:    andw 6(%r9), %cx
+; CHECK-BASELINE-NEXT:    andw 30(%rax), %si
+; CHECK-BASELINE-NEXT:    andw 28(%rax), %dx
+; CHECK-BASELINE-NEXT:    andw 26(%rax), %di
+; CHECK-BASELINE-NEXT:    andw 24(%rax), %r9w
+; CHECK-BASELINE-NEXT:    andw 22(%rax), %r10w
+; CHECK-BASELINE-NEXT:    andw 20(%rax), %r11w
+; CHECK-BASELINE-NEXT:    andw 18(%rax), %bx
+; CHECK-BASELINE-NEXT:    andw 16(%rax), %bp
+; CHECK-BASELINE-NEXT:    andw 14(%rax), %r14w
+; CHECK-BASELINE-NEXT:    andw 12(%rax), %r15w
+; CHECK-BASELINE-NEXT:    andw 10(%rax), %r12w
+; CHECK-BASELINE-NEXT:    andw 8(%rax), %r13w
+; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andw 6(%rax), %r13w
+; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    andw 4(%rax), %cx
 ; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andw 4(%r9), %r8w
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andw 2(%r9), %ax
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
+; CHECK-BASELINE-NEXT:    andw 2(%rax), %r13w
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    andw (%r9), %cx
+; CHECK-BASELINE-NEXT:    andw (%rax), %cx
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movl %edx, %ecx
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movl %r13d, %eax
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
+; CHECK-BASELINE-NEXT:    xorl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
@@ -3908,27 +3879,30 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    movw %si, 30(%r10)
-; CHECK-BASELINE-NEXT:    movw %di, 28(%r10)
-; CHECK-BASELINE-NEXT:    movw %r11w, 26(%r10)
-; CHECK-BASELINE-NEXT:    movw %bx, 24(%r10)
-; CHECK-BASELINE-NEXT:    movw %bp, 22(%r10)
-; CHECK-BASELINE-NEXT:    movw %r14w, 20(%r10)
-; CHECK-BASELINE-NEXT:    movw %r15w, 18(%r10)
-; CHECK-BASELINE-NEXT:    movw %r12w, 16(%r10)
-; CHECK-BASELINE-NEXT:    movw %r13w, 14(%r10)
-; CHECK-BASELINE-NEXT:    movw %ax, 12(%r10)
-; CHECK-BASELINE-NEXT:    movw %dx, 10(%r10)
-; CHECK-BASELINE-NEXT:    movw %cx, 8(%r10)
-; CHECK-BASELINE-NEXT:    movw %r9w, 6(%r10)
-; CHECK-BASELINE-NEXT:    movw %r8w, 4(%r10)
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 2(%r10)
-; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, (%r10)
-; CHECK-BASELINE-NEXT:    movq %r10, %rax
+; CHECK-BASELINE-NEXT:    movw %si, 30(%r8)
+; CHECK-BASELINE-NEXT:    movw %dx, 28(%r8)
+; CHECK-BASELINE-NEXT:    movw %di, 26(%r8)
+; CHECK-BASELINE-NEXT:    movw %r9w, 24(%r8)
+; CHECK-BASELINE-NEXT:    movw %r10w, 22(%r8)
+; CHECK-BASELINE-NEXT:    movw %r11w, 20(%r8)
+; CHECK-BASELINE-NEXT:    movw %bx, 18(%r8)
+; CHECK-BASELINE-NEXT:    movw %bp, 16(%r8)
+; CHECK-BASELINE-NEXT:    movw %r14w, 14(%r8)
+; CHECK-BASELINE-NEXT:    movw %r15w, 12(%r8)
+; CHECK-BASELINE-NEXT:    movw %r12w, 10(%r8)
+; CHECK-BASELINE-NEXT:    movw %r13w, 8(%r8)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movw %dx, 6(%r8)
+; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-BASELINE-NEXT:    movw %dx, 4(%r8)
+; CHECK-BASELINE-NEXT:    movw %ax, 2(%r8)
+; CHECK-BASELINE-NEXT:    movw %cx, (%r8)
+; CHECK-BASELINE-NEXT:    movq %r8, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -3945,20 +3919,20 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movq %rcx, %r9
-; CHECK-SSE1-NEXT:    movq %rdi, %r10
+; CHECK-SSE1-NEXT:    movq %rcx, %rax
+; CHECK-SSE1-NEXT:    movq %rdi, %r8
 ; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %edi
 ; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movl 28(%rdx), %edi
 ; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %edi
 ; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl 24(%rdx), %eax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %eax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl 20(%rdx), %r8d
-; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 24(%rdx), %ecx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %ecx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 20(%rdx), %r10d
+; CHECK-SSE1-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r11d
 ; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movl 16(%rdx), %ebx
@@ -3975,82 +3949,77 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movl (%rdx), %ecx
 ; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl 4(%rdx), %edi
+; CHECK-SSE1-NEXT:    movl 4(%rdx), %r9d
+; CHECK-SSE1-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %edi
 ; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %eax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movzwl (%rsi), %edx
 ; CHECK-SSE1-NEXT:    xorw %cx, %dx
 ; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %ecx
-; CHECK-SSE1-NEXT:    xorw %ax, %cx
+; CHECK-SSE1-NEXT:    xorw %di, %cx
 ; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %di, %ax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %ecx
-; CHECK-SSE1-NEXT:    xorw %r13w, %cx
-; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r12w, %ax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r15w, %ax
-; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %edx
-; CHECK-SSE1-NEXT:    xorw %r14w, %dx
-; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %r13d
-; CHECK-SSE1-NEXT:    xorw %bp, %r13w
-; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %r12d
-; CHECK-SSE1-NEXT:    xorw %bx, %r12w
-; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %r15d
-; CHECK-SSE1-NEXT:    xorw %r11w, %r15w
-; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %r14d
-; CHECK-SSE1-NEXT:    xorw %r8w, %r14w
-; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %ebp
-; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bp # 2-byte Folded Reload
-; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %ebx
-; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %bx # 2-byte Folded Reload
-; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r11d
-; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r11w # 2-byte Folded Reload
-; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %edi
+; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %ecx
+; CHECK-SSE1-NEXT:    xorw %r9w, %cx
+; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %edx
+; CHECK-SSE1-NEXT:    xorw %r13w, %dx
+; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %edx
+; CHECK-SSE1-NEXT:    xorw %r12w, %dx
+; CHECK-SSE1-NEXT:    movl %edx, %r13d
+; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %r12d
+; CHECK-SSE1-NEXT:    xorw %r15w, %r12w
+; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %r15d
+; CHECK-SSE1-NEXT:    xorw %r14w, %r15w
+; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %r14d
+; CHECK-SSE1-NEXT:    xorw %bp, %r14w
+; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %ebp
+; CHECK-SSE1-NEXT:    xorw %bx, %bp
+; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %ebx
+; CHECK-SSE1-NEXT:    xorw %r11w, %bx
+; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %r11d
+; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
+; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %r10d
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r10w # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %r9d
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %r9w # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %edi
 ; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %di # 2-byte Folded Reload
+; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %edx
+; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %dx # 2-byte Folded Reload
 ; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %esi
 ; CHECK-SSE1-NEXT:    xorw {{[-0-9]+}}(%r{{[sb]}}p), %si # 2-byte Folded Reload
-; CHECK-SSE1-NEXT:    andw 30(%r9), %si
-; CHECK-SSE1-NEXT:    andw 28(%r9), %di
-; CHECK-SSE1-NEXT:    andw 26(%r9), %r11w
-; CHECK-SSE1-NEXT:    andw 24(%r9), %bx
-; CHECK-SSE1-NEXT:    andw 22(%r9), %bp
-; CHECK-SSE1-NEXT:    andw 20(%r9), %r14w
-; CHECK-SSE1-NEXT:    andw 18(%r9), %r15w
-; CHECK-SSE1-NEXT:    andw 16(%r9), %r12w
-; CHECK-SSE1-NEXT:    andw 14(%r9), %r13w
-; CHECK-SSE1-NEXT:    andw 12(%r9), %dx
-; CHECK-SSE1-NEXT:    movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    andw 10(%r9), %ax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-SSE1-NEXT:    andw 8(%r9), %dx
-; CHECK-SSE1-NEXT:    andw 6(%r9), %cx
+; CHECK-SSE1-NEXT:    andw 30(%rax), %si
+; CHECK-SSE1-NEXT:    andw 28(%rax), %dx
+; CHECK-SSE1-NEXT:    andw 26(%rax), %di
+; CHECK-SSE1-NEXT:    andw 24(%rax), %r9w
+; CHECK-SSE1-NEXT:    andw 22(%rax), %r10w
+; CHECK-SSE1-NEXT:    andw 20(%rax), %r11w
+; CHECK-SSE1-NEXT:    andw 18(%rax), %bx
+; CHECK-SSE1-NEXT:    andw 16(%rax), %bp
+; CHECK-SSE1-NEXT:    andw 14(%rax), %r14w
+; CHECK-SSE1-NEXT:    andw 12(%rax), %r15w
+; CHECK-SSE1-NEXT:    andw 10(%rax), %r12w
+; CHECK-SSE1-NEXT:    andw 8(%rax), %r13w
+; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
+; CHECK-SSE1-NEXT:    andw 6(%rax), %r13w
+; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    andw 4(%rax), %cx
 ; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
-; CHECK-SSE1-NEXT:    andw 4(%r9), %r8w
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    andw 2(%r9), %ax
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
+; CHECK-SSE1-NEXT:    andw 2(%rax), %r13w
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-SSE1-NEXT:    andw (%r9), %cx
+; CHECK-SSE1-NEXT:    andw (%rax), %cx
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    movl %edx, %ecx
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
+; CHECK-SSE1-NEXT:    movl %r13d, %eax
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
+; CHECK-SSE1-NEXT:    xorl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload
@@ -4058,27 +4027,30 @@ define <16 x i16> @in_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    movw %si, 30(%r10)
-; CHECK-SSE1-NEXT:    movw %di, 28(%r10)
-; CHECK-SSE1-NEXT:    movw %r11w, 26(%r10)
-; CHECK-SSE1-NEXT:    movw %bx, 24(%r10)
-; CHECK-SSE1-NEXT:    movw %bp, 22(%r10)
-; CHECK-SSE1-NEXT:    movw %r14w, 20(%r10)
-; CHECK-SSE1-NEXT:    movw %r15w, 18(%r10)
-; CHECK-SSE1-NEXT:    movw %r12w, 16(%r10)
-; CHECK-SSE1-NEXT:    movw %r13w, 14(%r10)
-; CHECK-SSE1-NEXT:    movw %ax, 12(%r10)
-; CHECK-SSE1-NEXT:    movw %dx, 10(%r10)
-; CHECK-SSE1-NEXT:    movw %cx, 8(%r10)
-; CHECK-SSE1-NEXT:    movw %r9w, 6(%r10)
-; CHECK-SSE1-NEXT:    movw %r8w, 4(%r10)
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 2(%r10)
-; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, (%r10)
-; CHECK-SSE1-NEXT:    movq %r10, %rax
+; CHECK-SSE1-NEXT:    movw %si, 30(%r8)
+; CHECK-SSE1-NEXT:    movw %dx, 28(%r8)
+; CHECK-SSE1-NEXT:    movw %di, 26(%r8)
+; CHECK-SSE1-NEXT:    movw %r9w, 24(%r8)
+; CHECK-SSE1-NEXT:    movw %r10w, 22(%r8)
+; CHECK-SSE1-NEXT:    movw %r11w, 20(%r8)
+; CHECK-SSE1-NEXT:    movw %bx, 18(%r8)
+; CHECK-SSE1-NEXT:    movw %bp, 16(%r8)
+; CHECK-SSE1-NEXT:    movw %r14w, 14(%r8)
+; CHECK-SSE1-NEXT:    movw %r15w, 12(%r8)
+; CHECK-SSE1-NEXT:    movw %r12w, 10(%r8)
+; CHECK-SSE1-NEXT:    movw %r13w, 8(%r8)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-SSE1-NEXT:    movw %dx, 6(%r8)
+; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Reload
+; CHECK-SSE1-NEXT:    movw %dx, 4(%r8)
+; CHECK-SSE1-NEXT:    movw %ax, 2(%r8)
+; CHECK-SSE1-NEXT:    movw %cx, (%r8)
+; CHECK-SSE1-NEXT:    movq %r8, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13
@@ -4125,33 +4097,33 @@ define <8 x i32> @in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movl 28(%rdx), %ebp
-; CHECK-BASELINE-NEXT:    movl 24(%rdx), %ebx
-; CHECK-BASELINE-NEXT:    movl 20(%rdx), %r10d
-; CHECK-BASELINE-NEXT:    movl 16(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl 28(%rdx), %eax
 ; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r12d
-; CHECK-BASELINE-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r14d
-; CHECK-BASELINE-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movl (%rdx), %r15d
+; CHECK-BASELINE-NEXT:    movl 24(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 20(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movl 16(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movl 12(%rdx), %ebp
+; CHECK-BASELINE-NEXT:    movl 8(%rdx), %r15d
+; CHECK-BASELINE-NEXT:    movl (%rdx), %r10d
 ; CHECK-BASELINE-NEXT:    movl 4(%rdx), %r13d
 ; CHECK-BASELINE-NEXT:    movl (%rsi), %r8d
-; CHECK-BASELINE-NEXT:    xorl %r15d, %r8d
+; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
 ; CHECK-BASELINE-NEXT:    movl 4(%rsi), %r9d
 ; CHECK-BASELINE-NEXT:    xorl %r13d, %r9d
 ; CHECK-BASELINE-NEXT:    movl 8(%rsi), %r11d
-; CHECK-BASELINE-NEXT:    xorl %r14d, %r11d
+; CHECK-BASELINE-NEXT:    xorl %r15d, %r11d
 ; CHECK-BASELINE-NEXT:    movl 12(%rsi), %r14d
-; CHECK-BASELINE-NEXT:    xorl %r12d, %r14d
+; CHECK-BASELINE-NEXT:    xorl %ebp, %r14d
 ; CHECK-BASELINE-NEXT:    movl 16(%rsi), %r12d
-; CHECK-BASELINE-NEXT:    xorl %eax, %r12d
+; CHECK-BASELINE-NEXT:    xorl %ebx, %r12d
 ; CHECK-BASELINE-NEXT:    movl 20(%rsi), %edx
-; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
+; CHECK-BASELINE-NEXT:    xorl %eax, %edx
 ; CHECK-BASELINE-NEXT:    movl 24(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorl %ebx, %eax
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    movl 28(%rsi), %esi
-; CHECK-BASELINE-NEXT:    xorl %ebp, %esi
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    andl 28(%rcx), %esi
 ; CHECK-BASELINE-NEXT:    andl 24(%rcx), %eax
 ; CHECK-BASELINE-NEXT:    andl 20(%rcx), %edx
@@ -4160,14 +4132,14 @@ define <8 x i32> @in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    andl 8(%rcx), %r11d
 ; CHECK-BASELINE-NEXT:    andl 4(%rcx), %r9d
 ; CHECK-BASELINE-NEXT:    andl (%rcx), %r8d
-; CHECK-BASELINE-NEXT:    xorl %r15d, %r8d
+; CHECK-BASELINE-NEXT:    xorl %r10d, %r8d
 ; CHECK-BASELINE-NEXT:    xorl %r13d, %r9d
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
-; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
-; CHECK-BASELINE-NEXT:    xorl %ebx, %eax
-; CHECK-BASELINE-NEXT:    xorl %ebp, %esi
+; CHECK-BASELINE-NEXT:    xorl %r15d, %r11d
+; CHECK-BASELINE-NEXT:    xorl %ebp, %r14d
+; CHECK-BASELINE-NEXT:    xorl %ebx, %r12d
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-BASELINE-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
 ; CHECK-BASELINE-NEXT:    movl %esi, 28(%rdi)
 ; CHECK-BASELINE-NEXT:    movl %eax, 24(%rdi)
 ; CHECK-BASELINE-NEXT:    movl %edx, 20(%rdi)
@@ -4193,33 +4165,33 @@ define <8 x i32> @in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movl 28(%rdx), %ebp
-; CHECK-SSE1-NEXT:    movl 24(%rdx), %ebx
-; CHECK-SSE1-NEXT:    movl 20(%rdx), %r10d
-; CHECK-SSE1-NEXT:    movl 16(%rdx), %eax
+; CHECK-SSE1-NEXT:    movl 28(%rdx), %eax
 ; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl 12(%rdx), %r12d
-; CHECK-SSE1-NEXT:    movl %r12d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl 8(%rdx), %r14d
-; CHECK-SSE1-NEXT:    movl %r14d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movl (%rdx), %r15d
+; CHECK-SSE1-NEXT:    movl 24(%rdx), %eax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 20(%rdx), %eax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movl 16(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movl 12(%rdx), %ebp
+; CHECK-SSE1-NEXT:    movl 8(%rdx), %r15d
+; CHECK-SSE1-NEXT:    movl (%rdx), %r10d
 ; CHECK-SSE1-NEXT:    movl 4(%rdx), %r13d
 ; CHECK-SSE1-NEXT:    movl (%rsi), %r8d
-; CHECK-SSE1-NEXT:    xorl %r15d, %r8d
+; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
 ; CHECK-SSE1-NEXT:    movl 4(%rsi), %r9d
 ; CHECK-SSE1-NEXT:    xorl %r13d, %r9d
 ; CHECK-SSE1-NEXT:    movl 8(%rsi), %r11d
-; CHECK-SSE1-NEXT:    xorl %r14d, %r11d
+; CHECK-SSE1-NEXT:    xorl %r15d, %r11d
 ; CHECK-SSE1-NEXT:    movl 12(%rsi), %r14d
-; CHECK-SSE1-NEXT:    xorl %r12d, %r14d
+; CHECK-SSE1-NEXT:    xorl %ebp, %r14d
 ; CHECK-SSE1-NEXT:    movl 16(%rsi), %r12d
-; CHECK-SSE1-NEXT:    xorl %eax, %r12d
+; CHECK-SSE1-NEXT:    xorl %ebx, %r12d
 ; CHECK-SSE1-NEXT:    movl 20(%rsi), %edx
-; CHECK-SSE1-NEXT:    xorl %r10d, %edx
+; CHECK-SSE1-NEXT:    xorl %eax, %edx
 ; CHECK-SSE1-NEXT:    movl 24(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorl %ebx, %eax
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    movl 28(%rsi), %esi
-; CHECK-SSE1-NEXT:    xorl %ebp, %esi
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    andl 28(%rcx), %esi
 ; CHECK-SSE1-NEXT:    andl 24(%rcx), %eax
 ; CHECK-SSE1-NEXT:    andl 20(%rcx), %edx
@@ -4228,14 +4200,14 @@ define <8 x i32> @in_v8i32(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    andl 8(%rcx), %r11d
 ; CHECK-SSE1-NEXT:    andl 4(%rcx), %r9d
 ; CHECK-SSE1-NEXT:    andl (%rcx), %r8d
-; CHECK-SSE1-NEXT:    xorl %r15d, %r8d
+; CHECK-SSE1-NEXT:    xorl %r10d, %r8d
 ; CHECK-SSE1-NEXT:    xorl %r13d, %r9d
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 4-byte Folded Reload
-; CHECK-SSE1-NEXT:    xorl %r10d, %edx
-; CHECK-SSE1-NEXT:    xorl %ebx, %eax
-; CHECK-SSE1-NEXT:    xorl %ebp, %esi
+; CHECK-SSE1-NEXT:    xorl %r15d, %r11d
+; CHECK-SSE1-NEXT:    xorl %ebp, %r14d
+; CHECK-SSE1-NEXT:    xorl %ebx, %r12d
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
+; CHECK-SSE1-NEXT:    xorl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 4-byte Folded Reload
 ; CHECK-SSE1-NEXT:    movl %esi, 28(%rdi)
 ; CHECK-SSE1-NEXT:    movl %eax, 24(%rdi)
 ; CHECK-SSE1-NEXT:    movl %edx, 20(%rdi)
diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
index ebb5e135eacd02..f8223b27af773e 100644
--- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
@@ -55,10 +55,10 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; X86-NEXT:    testb $32, %cl
 ; X86-NEXT:    cmovnel %eax, %edx
 ; X86-NEXT:    cmovnel %ebx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    testb $32, %cl
@@ -78,7 +78,7 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; X86-NEXT:    shrl %cl, %edi
 ; X86-NEXT:    testb $32, %ch
 ; X86-NEXT:    cmovel %edi, %ebx
-; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrdl %cl, %ebp, %eax
 ; X86-NEXT:    testb $32, %cl
@@ -92,19 +92,21 @@ define <2 x i64> @vec_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-NEXT:    cmovnel %ecx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    movl $-1, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    cmovnel %eax, %edi
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    cmovnel %eax, %ecx
+; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    xorl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    orl %ebp, %ebx
-; X86-NEXT:    cmovnel %ecx, %esi
-; X86-NEXT:    cmovnel %ecx, %edx
+; X86-NEXT:    cmovnel %eax, %esi
+; X86-NEXT:    cmovnel %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl %edx, 12(%ecx)
 ; X86-NEXT:    movl %esi, 8(%ecx)
+; X86-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NEXT:    movl %eax, 4(%ecx)
 ; X86-NEXT:    movl %edi, (%ecx)
 ; X86-NEXT:    movl %ecx, %eax
@@ -169,7 +171,6 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
@@ -178,38 +179,40 @@ define <4 x i32> @vec_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl %esi, %ebp
 ; X86-NEXT:    shrl %cl, %ebp
 ; X86-NEXT:    cmpl %ebp, %ebx
-; X86-NEXT:    movl $-1, %edx
-; X86-NEXT:    cmovnel %edx, %esi
+; X86-NEXT:    movl $-1, %eax
+; X86-NEXT:    cmovnel %eax, %esi
 ; X86-NEXT:    movl $-1, %ebx
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    movb %ah, %cl
-; X86-NEXT:    shll %cl, %edx
-; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    movl %eax, %ebp
 ; X86-NEXT:    shrl %cl, %ebp
 ; X86-NEXT:    cmpl %ebp, %edi
-; X86-NEXT:    cmovnel %ebx, %edx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movb %ch, %cl
+; X86-NEXT:    cmovnel %ebx, %eax
+; X86-NEXT:    movl $-1, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll %cl, %edi
 ; X86-NEXT:    movl %edi, %ebp
 ; X86-NEXT:    shrl %cl, %ebp
-; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmpl %ebp, %ebx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmovnel %eax, %edi
+; X86-NEXT:    cmovnel %edx, %edi
 ; X86-NEXT:    movl %ebx, %ebp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    shll %cl, %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    shrl %cl, %eax
-; X86-NEXT:    cmpl %eax, %ebx
-; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    cmovnel %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebp, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl %ebp, %edx
+; X86-NEXT:    shrl %cl, %edx
+; X86-NEXT:    cmpl %edx, %ebx
+; X86-NEXT:    movl $-1, %ecx
+; X86-NEXT:    cmovnel %ecx, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ebp, 12(%ecx)
+; X86-NEXT:    movl %edi, 8(%ecx)
+; X86-NEXT:    movl %eax, 4(%ecx)
+; X86-NEXT:    movl %esi, (%ecx)
+; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -487,8 +490,8 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    subl $48, %esp
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %ah
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %dh
 ; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
@@ -509,21 +512,21 @@ define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
 ; X86-NEXT:    cmpb %bl, %dh
 ; X86-NEXT:    cmovnel %esi, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movb %ch, %ah
-; X86-NEXT:    movb %dl, %cl
-; X86-NEXT:    shlb %cl, %ah
-; X86-NEXT:    movzbl %ah, %edi
-; X86-NEXT:    shrb %cl, %ah
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    cmpb %ah, %ch
-; X86-NEXT:    cmovnel %esi, %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movb %dl, %ah
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movb %ch, %cl
 ; X86-NEXT:    shlb %cl, %ah
 ; X86-NEXT:    movzbl %ah, %edi
 ; X86-NEXT:    shrb %cl, %ah
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
 ; X86-NEXT:    cmpb %ah, %dl
+; X86-NEXT:    cmovnel %esi, %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movb %ch, %dl
+; X86-NEXT:    movb %al, %cl
+; X86-NEXT:    shlb %cl, %dl
+; X86-NEXT:    movzbl %dl, %edi
+; X86-NEXT:    shrb %cl, %dl
+; X86-NEXT:    cmpb %dl, %ch
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmovnel %esi, %edi
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 3c98eba69ae5bc..f05a5db0ca7b81 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -652,58 +652,57 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    pushl %ebx
 ; FALLBACK16-NEXT:    pushl %edi
 ; FALLBACK16-NEXT:    pushl %esi
-; FALLBACK16-NEXT:    subl $60, %esp
+; FALLBACK16-NEXT:    subl $44, %esp
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT:    movl (%ecx), %edx
+; FALLBACK16-NEXT:    movl (%ecx), %ebx
 ; FALLBACK16-NEXT:    movl 4(%ecx), %esi
 ; FALLBACK16-NEXT:    movl 8(%ecx), %edi
 ; FALLBACK16-NEXT:    movl 12(%ecx), %ecx
 ; FALLBACK16-NEXT:    movb (%eax), %ah
-; FALLBACK16-NEXT:    movb %ah, %al
-; FALLBACK16-NEXT:    shlb $3, %al
+; FALLBACK16-NEXT:    movb %ah, %dh
+; FALLBACK16-NEXT:    shlb $3, %dh
 ; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, (%esp)
 ; FALLBACK16-NEXT:    andb $12, %ah
-; FALLBACK16-NEXT:    movzbl %ah, %ebp
-; FALLBACK16-NEXT:    movl 20(%esp,%ebp), %esi
-; FALLBACK16-NEXT:    movl %esi, %ebx
-; FALLBACK16-NEXT:    movl %eax, %ecx
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    movl %eax, %edx
+; FALLBACK16-NEXT:    movzbl %ah, %ebx
+; FALLBACK16-NEXT:    movl 4(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    movl %esi, %eax
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movb %dh, %dl
 ; FALLBACK16-NEXT:    notb %dl
-; FALLBACK16-NEXT:    movl 24(%esp,%ebp), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK16-NEXT:    movl 8(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    leal (%ebp,%ebp), %edi
 ; FALLBACK16-NEXT:    movl %edx, %ecx
 ; FALLBACK16-NEXT:    shll %cl, %edi
-; FALLBACK16-NEXT:    orl %ebx, %edi
-; FALLBACK16-NEXT:    movl 16(%esp,%ebp), %ebx
-; FALLBACK16-NEXT:    movl %eax, %ecx
-; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movl (%esp,%ebx), %eax
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
 ; FALLBACK16-NEXT:    addl %esi, %esi
 ; FALLBACK16-NEXT:    movl %edx, %ecx
 ; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    orl %ebx, %esi
-; FALLBACK16-NEXT:    movl %eax, %ecx
-; FALLBACK16-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; FALLBACK16-NEXT:    movl 28(%esp,%ebp), %ebx
-; FALLBACK16-NEXT:    leal (%ebx,%ebx), %ebp
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 12(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    leal (%eax,%eax), %ebx
 ; FALLBACK16-NEXT:    movl %edx, %ecx
-; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK16-NEXT:    movl %eax, %ecx
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    movl %ebx, 12(%edx)
-; FALLBACK16-NEXT:    movl %ebp, 8(%edx)
-; FALLBACK16-NEXT:    movl %esi, (%edx)
-; FALLBACK16-NEXT:    movl %edi, 4(%edx)
-; FALLBACK16-NEXT:    addl $60, %esp
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %ebp, %ebx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl %eax, 12(%ebp)
+; FALLBACK16-NEXT:    movl %ebx, 8(%ebp)
+; FALLBACK16-NEXT:    movl %esi, (%ebp)
+; FALLBACK16-NEXT:    movl %edi, 4(%ebp)
+; FALLBACK16-NEXT:    addl $44, %esp
 ; FALLBACK16-NEXT:    popl %esi
 ; FALLBACK16-NEXT:    popl %edi
 ; FALLBACK16-NEXT:    popl %ebx
@@ -865,50 +864,46 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK20-NEXT:    movups (%ecx), %xmm0
-; FALLBACK20-NEXT:    movzbl (%eax), %ecx
-; FALLBACK20-NEXT:    movl %ecx, %eax
-; FALLBACK20-NEXT:    shlb $3, %al
+; FALLBACK20-NEXT:    movzbl (%eax), %eax
+; FALLBACK20-NEXT:    movb %al, %ch
+; FALLBACK20-NEXT:    shlb $3, %ch
+; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; FALLBACK20-NEXT:    xorps %xmm1, %xmm1
 ; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    andb $12, %cl
-; FALLBACK20-NEXT:    movzbl %cl, %edi
-; FALLBACK20-NEXT:    movl 16(%esp,%edi), %ebx
-; FALLBACK20-NEXT:    movl 20(%esp,%edi), %esi
-; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    andb $12, %al
+; FALLBACK20-NEXT:    movzbl %al, %eax
+; FALLBACK20-NEXT:    movl 16(%esp,%eax), %ebx
+; FALLBACK20-NEXT:    movl 20(%esp,%eax), %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    movl %eax, %edx
-; FALLBACK20-NEXT:    notb %dl
-; FALLBACK20-NEXT:    addl %esi, %esi
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    notb %ch
+; FALLBACK20-NEXT:    leal (%edi,%edi), %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %esi
 ; FALLBACK20-NEXT:    orl %ebx, %esi
-; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 24(%esp,%edi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %esi
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %esi
-; FALLBACK20-NEXT:    movl 28(%esp,%edi), %edi
-; FALLBACK20-NEXT:    leal (%edi,%edi), %ebp
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movl 24(%esp,%eax), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %edx
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    movl 28(%esp,%eax), %eax
+; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %ebp
-; FALLBACK20-NEXT:    orl %esi, %ebp
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %edx, %ebp
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %edi
 ; FALLBACK20-NEXT:    addl %ebx, %ebx
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %esi, %ebx
+; FALLBACK20-NEXT:    orl %edi, %ebx
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    movl %edi, 12(%edx)
+; FALLBACK20-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl %eax, 12(%edx)
 ; FALLBACK20-NEXT:    movl %ebx, 4(%edx)
 ; FALLBACK20-NEXT:    movl %ebp, 8(%edx)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT:    movl %eax, (%edx)
+; FALLBACK20-NEXT:    movl %esi, (%edx)
 ; FALLBACK20-NEXT:    addl $60, %esp
 ; FALLBACK20-NEXT:    popl %esi
 ; FALLBACK20-NEXT:    popl %edi
@@ -1055,50 +1050,46 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK24-NEXT:    vmovups (%ecx), %xmm0
-; FALLBACK24-NEXT:    movzbl (%eax), %ecx
-; FALLBACK24-NEXT:    movl %ecx, %eax
-; FALLBACK24-NEXT:    shlb $3, %al
+; FALLBACK24-NEXT:    movzbl (%eax), %eax
+; FALLBACK24-NEXT:    movb %al, %ch
+; FALLBACK24-NEXT:    shlb $3, %ch
+; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    andb $12, %cl
-; FALLBACK24-NEXT:    movzbl %cl, %edi
-; FALLBACK24-NEXT:    movl 16(%esp,%edi), %ebx
-; FALLBACK24-NEXT:    movl 20(%esp,%edi), %esi
-; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    andb $12, %al
+; FALLBACK24-NEXT:    movzbl %al, %eax
+; FALLBACK24-NEXT:    movl 16(%esp,%eax), %ebx
+; FALLBACK24-NEXT:    movl 20(%esp,%eax), %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    movl %eax, %edx
-; FALLBACK24-NEXT:    notb %dl
-; FALLBACK24-NEXT:    addl %esi, %esi
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    notb %ch
+; FALLBACK24-NEXT:    leal (%edi,%edi), %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %esi
 ; FALLBACK24-NEXT:    orl %ebx, %esi
-; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 24(%esp,%edi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %esi
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %esi
-; FALLBACK24-NEXT:    movl 28(%esp,%edi), %edi
-; FALLBACK24-NEXT:    leal (%edi,%edi), %ebp
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movl 24(%esp,%eax), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %edx
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    movl 28(%esp,%eax), %eax
+; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %ebp
-; FALLBACK24-NEXT:    orl %esi, %ebp
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %edx, %ebp
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %edi
 ; FALLBACK24-NEXT:    addl %ebx, %ebx
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %esi, %ebx
+; FALLBACK24-NEXT:    orl %edi, %ebx
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    movl %edi, 12(%edx)
+; FALLBACK24-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl %eax, 12(%edx)
 ; FALLBACK24-NEXT:    movl %ebx, 4(%edx)
 ; FALLBACK24-NEXT:    movl %ebp, 8(%edx)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT:    movl %eax, (%edx)
+; FALLBACK24-NEXT:    movl %esi, (%edx)
 ; FALLBACK24-NEXT:    addl $60, %esp
 ; FALLBACK24-NEXT:    popl %esi
 ; FALLBACK24-NEXT:    popl %edi
@@ -1245,50 +1236,46 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK28-NEXT:    vmovups (%ecx), %xmm0
-; FALLBACK28-NEXT:    movzbl (%eax), %ecx
-; FALLBACK28-NEXT:    movl %ecx, %eax
-; FALLBACK28-NEXT:    shlb $3, %al
+; FALLBACK28-NEXT:    movzbl (%eax), %eax
+; FALLBACK28-NEXT:    movb %al, %ch
+; FALLBACK28-NEXT:    shlb $3, %ch
+; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    andb $12, %cl
-; FALLBACK28-NEXT:    movzbl %cl, %edi
-; FALLBACK28-NEXT:    movl 16(%esp,%edi), %ebx
-; FALLBACK28-NEXT:    movl 20(%esp,%edi), %esi
-; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    andb $12, %al
+; FALLBACK28-NEXT:    movzbl %al, %eax
+; FALLBACK28-NEXT:    movl 16(%esp,%eax), %ebx
+; FALLBACK28-NEXT:    movl 20(%esp,%eax), %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    movl %eax, %edx
-; FALLBACK28-NEXT:    notb %dl
-; FALLBACK28-NEXT:    addl %esi, %esi
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    notb %ch
+; FALLBACK28-NEXT:    leal (%edi,%edi), %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %esi
 ; FALLBACK28-NEXT:    orl %ebx, %esi
-; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 24(%esp,%edi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %esi
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %esi
-; FALLBACK28-NEXT:    movl 28(%esp,%edi), %edi
-; FALLBACK28-NEXT:    leal (%edi,%edi), %ebp
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movl 24(%esp,%eax), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %edx
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    movl 28(%esp,%eax), %eax
+; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %ebp
-; FALLBACK28-NEXT:    orl %esi, %ebp
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %edx, %ebp
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %edi
 ; FALLBACK28-NEXT:    addl %ebx, %ebx
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %esi, %ebx
+; FALLBACK28-NEXT:    orl %edi, %ebx
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    movl %edi, 12(%edx)
+; FALLBACK28-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl %eax, 12(%edx)
 ; FALLBACK28-NEXT:    movl %ebx, 4(%edx)
 ; FALLBACK28-NEXT:    movl %ebp, 8(%edx)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT:    movl %eax, (%edx)
+; FALLBACK28-NEXT:    movl %esi, (%edx)
 ; FALLBACK28-NEXT:    addl $60, %esp
 ; FALLBACK28-NEXT:    popl %esi
 ; FALLBACK28-NEXT:    popl %edi
@@ -1670,54 +1657,54 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    subl $60, %esp
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT:    movl (%ecx), %ebx
+; FALLBACK16-NEXT:    movl (%ecx), %edx
 ; FALLBACK16-NEXT:    movl 4(%ecx), %esi
 ; FALLBACK16-NEXT:    movl 8(%ecx), %edi
-; FALLBACK16-NEXT:    movl 12(%ecx), %ecx
-; FALLBACK16-NEXT:    movb (%eax), %ah
-; FALLBACK16-NEXT:    movb %ah, %dh
-; FALLBACK16-NEXT:    shlb $3, %dh
+; FALLBACK16-NEXT:    movl 12(%ecx), %ebx
+; FALLBACK16-NEXT:    movzbl (%eax), %eax
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shlb $3, %cl
 ; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    andb $12, %ah
-; FALLBACK16-NEXT:    negb %ah
-; FALLBACK16-NEXT:    movsbl %ah, %ebp
-; FALLBACK16-NEXT:    movl 32(%esp,%ebp), %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $12, %al
+; FALLBACK16-NEXT:    negb %al
+; FALLBACK16-NEXT:    movsbl %al, %ebp
+; FALLBACK16-NEXT:    movl 32(%esp,%ebp), %edi
 ; FALLBACK16-NEXT:    movl 36(%esp,%ebp), %esi
-; FALLBACK16-NEXT:    movl %esi, %edi
-; FALLBACK16-NEXT:    movb %dh, %cl
-; FALLBACK16-NEXT:    shll %cl, %edi
-; FALLBACK16-NEXT:    movb %dh, %dl
-; FALLBACK16-NEXT:    notb %dl
+; FALLBACK16-NEXT:    movl %esi, %eax
+; FALLBACK16-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movb %cl, %ch
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movl %edi, %ebx
 ; FALLBACK16-NEXT:    shrl %ebx
-; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    orl %edi, %ebx
-; FALLBACK16-NEXT:    movl 44(%esp,%ebp), %eax
-; FALLBACK16-NEXT:    movb %dh, %cl
-; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    movl 40(%esp,%ebp), %edi
-; FALLBACK16-NEXT:    movl %edi, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebx
+; FALLBACK16-NEXT:    movl 44(%esp,%ebp), %edx
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl 40(%esp,%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, %ebp
 ; FALLBACK16-NEXT:    shrl %ebp
-; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    orl %eax, %ebp
-; FALLBACK16-NEXT:    movb %dh, %cl
-; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %edx, %ebp
+; FALLBACK16-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
 ; FALLBACK16-NEXT:    shrl %esi
-; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %esi
-; FALLBACK16-NEXT:    orl %edi, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movb %dh, %cl
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    movl %edx, (%eax)
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    movl %edi, (%eax)
 ; FALLBACK16-NEXT:    movl %esi, 8(%eax)
 ; FALLBACK16-NEXT:    movl %ebp, 12(%eax)
 ; FALLBACK16-NEXT:    movl %ebx, 4(%eax)
@@ -1877,56 +1864,54 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    pushl %ebx
 ; FALLBACK20-NEXT:    pushl %edi
 ; FALLBACK20-NEXT:    pushl %esi
-; FALLBACK20-NEXT:    subl $60, %esp
+; FALLBACK20-NEXT:    subl $44, %esp
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK20-NEXT:    movups (%ecx), %xmm0
-; FALLBACK20-NEXT:    movzbl (%eax), %ecx
-; FALLBACK20-NEXT:    movl %ecx, %eax
-; FALLBACK20-NEXT:    shlb $3, %al
+; FALLBACK20-NEXT:    movzbl (%eax), %eax
+; FALLBACK20-NEXT:    movb %al, %ch
+; FALLBACK20-NEXT:    shlb $3, %ch
 ; FALLBACK20-NEXT:    xorps %xmm1, %xmm1
-; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, (%esp)
 ; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    andb $12, %cl
-; FALLBACK20-NEXT:    negb %cl
-; FALLBACK20-NEXT:    movsbl %cl, %edi
-; FALLBACK20-NEXT:    movl 44(%esp,%edi), %ebx
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    andb $12, %al
+; FALLBACK20-NEXT:    negb %al
+; FALLBACK20-NEXT:    movsbl %al, %edi
+; FALLBACK20-NEXT:    movl 28(%esp,%edi), %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    movb %ch, %dl
 ; FALLBACK20-NEXT:    notb %dl
-; FALLBACK20-NEXT:    movl 40(%esp,%edi), %ebp
+; FALLBACK20-NEXT:    movl 24(%esp,%edi), %ebp
 ; FALLBACK20-NEXT:    movl %ebp, %esi
 ; FALLBACK20-NEXT:    shrl %esi
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movb %dl, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %esi
-; FALLBACK20-NEXT:    orl %ebx, %esi
-; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    orl %eax, %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %ebp
-; FALLBACK20-NEXT:    movl 32(%esp,%edi), %ecx
-; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 36(%esp,%edi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %edi
+; FALLBACK20-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    movl 20(%esp,%edi), %eax
+; FALLBACK20-NEXT:    movl %eax, %edi
 ; FALLBACK20-NEXT:    shrl %edi
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movb %dl, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %edi
 ; FALLBACK20-NEXT:    orl %ebp, %edi
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    movl %ebx, %ebp
 ; FALLBACK20-NEXT:    shrl %ebp
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movb %dl, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    orl %ebx, %ebp
-; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    movl %eax, (%edx)
-; FALLBACK20-NEXT:    movl %ebp, 4(%edx)
-; FALLBACK20-NEXT:    movl %edi, 8(%edx)
-; FALLBACK20-NEXT:    movl %esi, 12(%edx)
-; FALLBACK20-NEXT:    addl $60, %esp
+; FALLBACK20-NEXT:    orl %eax, %ebp
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %ebx
+; FALLBACK20-NEXT:    movl %ebx, (%eax)
+; FALLBACK20-NEXT:    movl %ebp, 4(%eax)
+; FALLBACK20-NEXT:    movl %edi, 8(%eax)
+; FALLBACK20-NEXT:    movl %esi, 12(%eax)
+; FALLBACK20-NEXT:    addl $44, %esp
 ; FALLBACK20-NEXT:    popl %esi
 ; FALLBACK20-NEXT:    popl %edi
 ; FALLBACK20-NEXT:    popl %ebx
@@ -2069,56 +2054,54 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK24-NEXT:    pushl %ebx
 ; FALLBACK24-NEXT:    pushl %edi
 ; FALLBACK24-NEXT:    pushl %esi
-; FALLBACK24-NEXT:    subl $60, %esp
+; FALLBACK24-NEXT:    subl $44, %esp
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK24-NEXT:    vmovups (%ecx), %xmm0
-; FALLBACK24-NEXT:    movzbl (%eax), %ecx
-; FALLBACK24-NEXT:    movl %ecx, %eax
-; FALLBACK24-NEXT:    shlb $3, %al
+; FALLBACK24-NEXT:    movzbl (%eax), %eax
+; FALLBACK24-NEXT:    movb %al, %ch
+; FALLBACK24-NEXT:    shlb $3, %ch
 ; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovaps %xmm1, (%esp)
 ; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    andb $12, %cl
-; FALLBACK24-NEXT:    negb %cl
-; FALLBACK24-NEXT:    movsbl %cl, %edi
-; FALLBACK24-NEXT:    movl 44(%esp,%edi), %ebx
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    andb $12, %al
+; FALLBACK24-NEXT:    negb %al
+; FALLBACK24-NEXT:    movsbl %al, %edi
+; FALLBACK24-NEXT:    movl 28(%esp,%edi), %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    movb %ch, %dl
 ; FALLBACK24-NEXT:    notb %dl
-; FALLBACK24-NEXT:    movl 40(%esp,%edi), %ebp
+; FALLBACK24-NEXT:    movl 24(%esp,%edi), %ebp
 ; FALLBACK24-NEXT:    movl %ebp, %esi
 ; FALLBACK24-NEXT:    shrl %esi
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movb %dl, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %esi
-; FALLBACK24-NEXT:    orl %ebx, %esi
-; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    orl %eax, %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %ebp
-; FALLBACK24-NEXT:    movl 32(%esp,%edi), %ecx
-; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 36(%esp,%edi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %edi
+; FALLBACK24-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    movl 20(%esp,%edi), %eax
+; FALLBACK24-NEXT:    movl %eax, %edi
 ; FALLBACK24-NEXT:    shrl %edi
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movb %dl, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %edi
 ; FALLBACK24-NEXT:    orl %ebp, %edi
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    movl %ebx, %ebp
 ; FALLBACK24-NEXT:    shrl %ebp
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movb %dl, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    orl %ebx, %ebp
-; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    movl %eax, (%edx)
-; FALLBACK24-NEXT:    movl %ebp, 4(%edx)
-; FALLBACK24-NEXT:    movl %edi, 8(%edx)
-; FALLBACK24-NEXT:    movl %esi, 12(%edx)
-; FALLBACK24-NEXT:    addl $60, %esp
+; FALLBACK24-NEXT:    orl %eax, %ebp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %ebx
+; FALLBACK24-NEXT:    movl %ebx, (%eax)
+; FALLBACK24-NEXT:    movl %ebp, 4(%eax)
+; FALLBACK24-NEXT:    movl %edi, 8(%eax)
+; FALLBACK24-NEXT:    movl %esi, 12(%eax)
+; FALLBACK24-NEXT:    addl $44, %esp
 ; FALLBACK24-NEXT:    popl %esi
 ; FALLBACK24-NEXT:    popl %edi
 ; FALLBACK24-NEXT:    popl %ebx
@@ -2261,56 +2244,54 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    pushl %ebx
 ; FALLBACK28-NEXT:    pushl %edi
 ; FALLBACK28-NEXT:    pushl %esi
-; FALLBACK28-NEXT:    subl $60, %esp
+; FALLBACK28-NEXT:    subl $44, %esp
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK28-NEXT:    vmovups (%ecx), %xmm0
-; FALLBACK28-NEXT:    movzbl (%eax), %ecx
-; FALLBACK28-NEXT:    movl %ecx, %eax
-; FALLBACK28-NEXT:    shlb $3, %al
+; FALLBACK28-NEXT:    movzbl (%eax), %eax
+; FALLBACK28-NEXT:    movb %al, %ch
+; FALLBACK28-NEXT:    shlb $3, %ch
 ; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovaps %xmm1, (%esp)
 ; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    andb $12, %cl
-; FALLBACK28-NEXT:    negb %cl
-; FALLBACK28-NEXT:    movsbl %cl, %edi
-; FALLBACK28-NEXT:    movl 44(%esp,%edi), %ebx
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    andb $12, %al
+; FALLBACK28-NEXT:    negb %al
+; FALLBACK28-NEXT:    movsbl %al, %edi
+; FALLBACK28-NEXT:    movl 28(%esp,%edi), %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    movb %ch, %dl
 ; FALLBACK28-NEXT:    notb %dl
-; FALLBACK28-NEXT:    movl 40(%esp,%edi), %ebp
+; FALLBACK28-NEXT:    movl 24(%esp,%edi), %ebp
 ; FALLBACK28-NEXT:    movl %ebp, %esi
 ; FALLBACK28-NEXT:    shrl %esi
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movb %dl, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %esi
-; FALLBACK28-NEXT:    orl %ebx, %esi
-; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    orl %eax, %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %ebp
-; FALLBACK28-NEXT:    movl 32(%esp,%edi), %ecx
-; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 36(%esp,%edi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %edi
+; FALLBACK28-NEXT:    movl 16(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    movl 20(%esp,%edi), %eax
+; FALLBACK28-NEXT:    movl %eax, %edi
 ; FALLBACK28-NEXT:    shrl %edi
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movb %dl, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %edi
 ; FALLBACK28-NEXT:    orl %ebp, %edi
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    movl %ebx, %ebp
 ; FALLBACK28-NEXT:    shrl %ebp
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movb %dl, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    orl %ebx, %ebp
-; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    movl %eax, (%edx)
-; FALLBACK28-NEXT:    movl %ebp, 4(%edx)
-; FALLBACK28-NEXT:    movl %edi, 8(%edx)
-; FALLBACK28-NEXT:    movl %esi, 12(%edx)
-; FALLBACK28-NEXT:    addl $60, %esp
+; FALLBACK28-NEXT:    orl %eax, %ebp
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %ebx
+; FALLBACK28-NEXT:    movl %ebx, (%eax)
+; FALLBACK28-NEXT:    movl %ebp, 4(%eax)
+; FALLBACK28-NEXT:    movl %edi, 8(%eax)
+; FALLBACK28-NEXT:    movl %esi, 12(%eax)
+; FALLBACK28-NEXT:    addl $44, %esp
 ; FALLBACK28-NEXT:    popl %esi
 ; FALLBACK28-NEXT:    popl %edi
 ; FALLBACK28-NEXT:    popl %ebx
@@ -2709,6 +2690,7 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movb (%eax), %ah
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movb %ah, %al
 ; X86-NO-SHLD-NO-BMI2-NEXT:    shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, {{[0-9]+}}(%esp)
@@ -2719,38 +2701,38 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    andb $12, %ah
-; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl %ah, %ebp
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 20(%esp,%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movzbl %ah, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 20(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb %ch, %dl
 ; X86-NO-SHLD-NO-BMI2-NEXT:    notb %dl
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 24(%esp,%ebp), %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 24(%esp,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%ebp,%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb %dl, %cl
 ; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 16(%esp,%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 16(%esp,%ebx), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %eax
 ; X86-NO-SHLD-NO-BMI2-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb %dl, %cl
 ; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl 28(%esp,%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%ebx,%ebx), %ebp
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl 28(%esp,%ebx), %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-NEXT:    orl %ebp, %ebx
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-NEXT:    sarl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebx, 12(%edx)
-; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebp, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-NEXT:    sarl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %eax, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-NEXT:    movl %ebx, 8(%edx)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %esi, (%edx)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    movl %edi, 4(%edx)
 ; X86-NO-SHLD-NO-BMI2-NEXT:    addl $60, %esp
@@ -3689,120 +3671,114 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    pushl %edi
 ; FALLBACK16-NEXT:    pushl %esi
 ; FALLBACK16-NEXT:    subl $108, %esp
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK16-NEXT:    movl (%ebp), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 4(%ebp), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 8(%ebp), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 12(%ebp), %edi
-; FALLBACK16-NEXT:    movl 16(%ebp), %ebx
-; FALLBACK16-NEXT:    movb (%eax), %ah
-; FALLBACK16-NEXT:    movl 20(%ebp), %esi
-; FALLBACK16-NEXT:    movl 24(%ebp), %ecx
+; FALLBACK16-NEXT:    movl (%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%ebp), %edx
+; FALLBACK16-NEXT:    movl 12(%ebp), %esi
+; FALLBACK16-NEXT:    movl 16(%ebp), %edi
+; FALLBACK16-NEXT:    movzbl (%ebx), %ebx
+; FALLBACK16-NEXT:    movl 20(%ebp), %ecx
+; FALLBACK16-NEXT:    movl 24(%ebp), %eax
 ; FALLBACK16-NEXT:    movl 28(%ebp), %ebp
 ; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movb %ah, %dh
-; FALLBACK16-NEXT:    shlb $3, %dh
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, %eax
+; FALLBACK16-NEXT:    shlb $3, %al
+; FALLBACK16-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    andb $28, %ah
-; FALLBACK16-NEXT:    movzbl %ah, %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 32(%esp,%edi), %esi
-; FALLBACK16-NEXT:    movl 36(%esp,%edi), %eax
-; FALLBACK16-NEXT:    movl %eax, %ebx
-; FALLBACK16-NEXT:    movb %dh, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    movb %dh, %dl
-; FALLBACK16-NEXT:    notb %dl
-; FALLBACK16-NEXT:    movl 40(%esp,%edi), %edi
-; FALLBACK16-NEXT:    leal (%edi,%edi), %ebp
-; FALLBACK16-NEXT:    movl %edx, %ecx
-; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %ebx, %ebp
-; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %dh, %cl
-; FALLBACK16-NEXT:    shrl %cl, %esi
-; FALLBACK16-NEXT:    movl %eax, %ebx
-; FALLBACK16-NEXT:    addl %eax, %ebx
-; FALLBACK16-NEXT:    movl %edx, %ecx
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %esi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl 44(%esp,%eax), %ebp
-; FALLBACK16-NEXT:    movl %ebp, %esi
-; FALLBACK16-NEXT:    movb %dh, %cl
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $28, %bl
+; FALLBACK16-NEXT:    movzbl %bl, %ebx
+; FALLBACK16-NEXT:    movl 32(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    movl 36(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, %esi
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %esi
-; FALLBACK16-NEXT:    movl 48(%esp,%eax), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    leal (%eax,%eax), %ebx
-; FALLBACK16-NEXT:    movl %edx, %ecx
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %esi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %dh, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    addl %ebp, %ebp
-; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK16-NEXT:    leal (%edx,%edx), %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %edi, %ebp
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl 52(%esp,%eax), %edi
-; FALLBACK16-NEXT:    movl %edi, %ebx
-; FALLBACK16-NEXT:    movb %dh, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    movl 56(%esp,%eax), %esi
-; FALLBACK16-NEXT:    leal (%esi,%esi), %eax
-; FALLBACK16-NEXT:    movl %edx, %ecx
-; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %ebx, %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %dh, %cl
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    orl %esi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
 ; FALLBACK16-NEXT:    addl %edi, %edi
-; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %edi
-; FALLBACK16-NEXT:    orl %ebx, %edi
-; FALLBACK16-NEXT:    movb %dh, %cl
-; FALLBACK16-NEXT:    movl %esi, %eax
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 44(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %eax
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl 60(%esp,%ecx), %ebx
-; FALLBACK16-NEXT:    leal (%ebx,%ebx), %esi
-; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; FALLBACK16-NEXT:    shll %cl, %esi
 ; FALLBACK16-NEXT:    orl %eax, %esi
-; FALLBACK16-NEXT:    movb %dh, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl %ebx, 28(%eax)
-; FALLBACK16-NEXT:    movl %esi, 24(%eax)
-; FALLBACK16-NEXT:    movl %edi, 16(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK16-NEXT:    movl %ebp, 8(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, (%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %edx, %ebp
+; FALLBACK16-NEXT:    movl 52(%esp,%ebx), %edx
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 56(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    movl 60(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
+; FALLBACK16-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK16-NEXT:    movl %ebx, 24(%ecx)
+; FALLBACK16-NEXT:    movl %edx, 16(%ecx)
+; FALLBACK16-NEXT:    movl %esi, 20(%ecx)
+; FALLBACK16-NEXT:    movl %ebp, 8(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, (%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 4(%ecx)
 ; FALLBACK16-NEXT:    addl $108, %esp
 ; FALLBACK16-NEXT:    popl %esi
 ; FALLBACK16-NEXT:    popl %edi
@@ -3898,22 +3874,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    subl $108, %esp
 ; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT:    movl (%eax), %ecx
-; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl (%eax), %edx
 ; FALLBACK18-NEXT:    movl 4(%eax), %ecx
 ; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 8(%eax), %esi
 ; FALLBACK18-NEXT:    movl 12(%eax), %edi
 ; FALLBACK18-NEXT:    movl 16(%eax), %ebp
-; FALLBACK18-NEXT:    movzbl (%ebx), %ebx
-; FALLBACK18-NEXT:    movl 20(%eax), %edx
-; FALLBACK18-NEXT:    movl 24(%eax), %ecx
+; FALLBACK18-NEXT:    movzbl (%ebx), %ecx
+; FALLBACK18-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK18-NEXT:    movl 20(%eax), %ecx
+; FALLBACK18-NEXT:    movl 24(%eax), %ebx
 ; FALLBACK18-NEXT:    movl 28(%eax), %eax
 ; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
 ; FALLBACK18-NEXT:    movl %ebx, %eax
 ; FALLBACK18-NEXT:    shlb $3, %al
 ; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
@@ -3922,24 +3899,23 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    andb $28, %bl
-; FALLBACK18-NEXT:    movzbl %bl, %edi
+; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, %ecx
+; FALLBACK18-NEXT:    andb $28, %cl
+; FALLBACK18-NEXT:    movzbl %cl, %edi
 ; FALLBACK18-NEXT:    movl 36(%esp,%edi), %esi
 ; FALLBACK18-NEXT:    movl 40(%esp,%edi), %ecx
 ; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %eax, %esi, %edx
-; FALLBACK18-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %eax, %esi, %ebx
 ; FALLBACK18-NEXT:    movl %eax, %edx
-; FALLBACK18-NEXT:    movl %eax, %ebx
+; FALLBACK18-NEXT:    movl %eax, %ecx
 ; FALLBACK18-NEXT:    notb %dl
-; FALLBACK18-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK18-NEXT:    shlxl %edx, %ebp, %eax
-; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    orl %ebx, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl %ebx, %ecx
-; FALLBACK18-NEXT:    shrxl %ebx, 32(%esp,%edi), %ebx
+; FALLBACK18-NEXT:    shrxl %ecx, 32(%esp,%edi), %ebx
 ; FALLBACK18-NEXT:    addl %esi, %esi
 ; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
 ; FALLBACK18-NEXT:    orl %ebx, %eax
@@ -4012,16 +3988,16 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK19-NEXT:    movl 8(%ecx), %esi
 ; FALLBACK19-NEXT:    movl 12(%ecx), %edi
 ; FALLBACK19-NEXT:    movl 16(%ecx), %ebp
-; FALLBACK19-NEXT:    movzbl (%ebx), %ebx
-; FALLBACK19-NEXT:    movl 20(%ecx), %edx
-; FALLBACK19-NEXT:    movl 24(%ecx), %eax
+; FALLBACK19-NEXT:    movzbl (%ebx), %edx
+; FALLBACK19-NEXT:    movl 20(%ecx), %eax
+; FALLBACK19-NEXT:    movl 24(%ecx), %ebx
 ; FALLBACK19-NEXT:    movl 28(%ecx), %ecx
 ; FALLBACK19-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK19-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT:    movl %ebx, %ecx
+; FALLBACK19-NEXT:    movl %edx, %ecx
 ; FALLBACK19-NEXT:    shlb $3, %cl
 ; FALLBACK19-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
@@ -4031,8 +4007,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT:    andb $28, %bl
-; FALLBACK19-NEXT:    movzbl %bl, %ebp
+; FALLBACK19-NEXT:    andb $28, %dl
+; FALLBACK19-NEXT:    movzbl %dl, %ebp
 ; FALLBACK19-NEXT:    movl 24(%esp,%ebp), %esi
 ; FALLBACK19-NEXT:    movl 20(%esp,%ebp), %eax
 ; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4087,96 +4063,96 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK20-NEXT:    movups (%ecx), %xmm0
 ; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
-; FALLBACK20-NEXT:    movzbl (%eax), %ecx
-; FALLBACK20-NEXT:    movl %ecx, %eax
-; FALLBACK20-NEXT:    shlb $3, %al
+; FALLBACK20-NEXT:    movzbl (%eax), %eax
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shlb $3, %cl
 ; FALLBACK20-NEXT:    xorps %xmm2, %xmm2
 ; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    andb $28, %cl
-; FALLBACK20-NEXT:    movzbl %cl, %edi
-; FALLBACK20-NEXT:    movl 32(%esp,%edi), %esi
-; FALLBACK20-NEXT:    movl 36(%esp,%edi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %esi
-; FALLBACK20-NEXT:    movl %eax, %edx
-; FALLBACK20-NEXT:    notb %dl
-; FALLBACK20-NEXT:    addl %ebx, %ebx
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %esi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    andb $28, %al
+; FALLBACK20-NEXT:    movzbl %al, %edi
+; FALLBACK20-NEXT:    movl 32(%esp,%edi), %eax
+; FALLBACK20-NEXT:    movl 36(%esp,%edi), %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movb %cl, %bh
+; FALLBACK20-NEXT:    movb %cl, %bl
+; FALLBACK20-NEXT:    notb %bh
+; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %bh, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK20-NEXT:    movl 44(%esp,%edi), %ebp
-; FALLBACK20-NEXT:    movl %ebp, %esi
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    movl %ebp, %eax
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
 ; FALLBACK20-NEXT:    movl 48(%esp,%edi), %ecx
 ; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %esi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %edx
+; FALLBACK20-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    movb %bh, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK20-NEXT:    movl 40(%esp,%edi), %esi
-; FALLBACK20-NEXT:    movl %esi, %ebx
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl %esi, %eax
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    shrl %cl, %eax
 ; FALLBACK20-NEXT:    addl %ebp, %ebp
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movb %bh, %cl
 ; FALLBACK20-NEXT:    shll %cl, %ebp
-; FALLBACK20-NEXT:    orl %ebx, %ebp
-; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 52(%esp,%edi), %ebp
-; FALLBACK20-NEXT:    movl %ebp, %ebx
-; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    orl %eax, %ebp
+; FALLBACK20-NEXT:    movl 52(%esp,%edi), %edx
+; FALLBACK20-NEXT:    movl %edx, %eax
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl 56(%esp,%edi), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; FALLBACK20-NEXT:    movb %al, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    movl 56(%esp,%edi), %ecx
-; FALLBACK20-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; FALLBACK20-NEXT:    leal (%ecx,%ecx), %edi
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movl 60(%esp,%edi), %eax
+; FALLBACK20-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %edi
 ; FALLBACK20-NEXT:    orl %ebx, %edi
-; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    addl %ebp, %ebp
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shll %cl, %ebp
-; FALLBACK20-NEXT:    orl %edi, %ebp
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl 60(%esp,%ecx), %ebx
-; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edi
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shll %cl, %edi
-; FALLBACK20-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
 ; FALLBACK20-NEXT:    addl %esi, %esi
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT:    movl %ebx, 28(%eax)
-; FALLBACK20-NEXT:    movl %esi, 4(%eax)
-; FALLBACK20-NEXT:    movl %edi, 24(%eax)
-; FALLBACK20-NEXT:    movl %ebp, 16(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, (%eax)
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK20-NEXT:    movl %esi, 4(%ecx)
+; FALLBACK20-NEXT:    movl %edi, 24(%ecx)
+; FALLBACK20-NEXT:    movl %edx, 16(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK20-NEXT:    movl %ebp, 8(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, (%ecx)
 ; FALLBACK20-NEXT:    addl $108, %esp
 ; FALLBACK20-NEXT:    popl %esi
 ; FALLBACK20-NEXT:    popl %edi
@@ -4262,8 +4238,8 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movups (%ecx), %xmm0
 ; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
 ; FALLBACK22-NEXT:    movzbl (%eax), %ecx
-; FALLBACK22-NEXT:    movl %ecx, %edx
-; FALLBACK22-NEXT:    shlb $3, %dl
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    shlb $3, %al
 ; FALLBACK22-NEXT:    xorps %xmm2, %xmm2
 ; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -4271,52 +4247,47 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    andb $28, %cl
 ; FALLBACK22-NEXT:    movzbl %cl, %edi
-; FALLBACK22-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
-; FALLBACK22-NEXT:    movl %edx, %eax
-; FALLBACK22-NEXT:    notb %al
+; FALLBACK22-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    movl %eax, %edx
+; FALLBACK22-NEXT:    notb %dl
 ; FALLBACK22-NEXT:    movl 36(%esp,%edi), %esi
 ; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK22-NEXT:    orl %ecx, %esi
 ; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 48(%esp,%edi), %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    addl %ecx, %ecx
-; FALLBACK22-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK22-NEXT:    movl %eax, %ebp
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
 ; FALLBACK22-NEXT:    movl 44(%esp,%edi), %ecx
-; FALLBACK22-NEXT:    shrxl %edx, %ecx, %ebx
+; FALLBACK22-NEXT:    shrxl %eax, %ecx, %ebx
 ; FALLBACK22-NEXT:    orl %ebx, %esi
 ; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    addl %ecx, %ecx
-; FALLBACK22-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK22-NEXT:    movl 40(%esp,%edi), %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %eax, %ebx
+; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK22-NEXT:    movl 40(%esp,%edi), %ecx
+; FALLBACK22-NEXT:    shrxl %eax, %ecx, %ebx
 ; FALLBACK22-NEXT:    orl %ebx, %esi
 ; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 56(%esp,%edi), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK22-NEXT:    shlxl %ebp, %ebx, %eax
-; FALLBACK22-NEXT:    movl %ebp, %ecx
+; FALLBACK22-NEXT:    shlxl %edx, %ebx, %ebx
+; FALLBACK22-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK22-NEXT:    shrxl %edx, %ebx, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %eax, %ebx, %ebp
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK22-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %ebx, %ebx
-; FALLBACK22-NEXT:    shlxl %ecx, %ebx, %ebx
+; FALLBACK22-NEXT:    shlxl %edx, %ebx, %ebx
 ; FALLBACK22-NEXT:    orl %ebp, %ebx
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %eax, %esi, %ebp
+; FALLBACK22-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    movl 60(%esp,%edi), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %eax
+; FALLBACK22-NEXT:    shrxl %eax, %edi, %eax
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    movl %ecx, %edx
-; FALLBACK22-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK22-NEXT:    orl %ebp, %edi
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK22-NEXT:    addl %ecx, %ecx
 ; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ecx
 ; FALLBACK22-NEXT:    orl %esi, %ecx
@@ -4415,94 +4386,94 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT:    movzbl (%eax), %ecx
-; FALLBACK24-NEXT:    movl %ecx, %eax
-; FALLBACK24-NEXT:    shlb $3, %al
+; FALLBACK24-NEXT:    movzbl (%eax), %eax
+; FALLBACK24-NEXT:    movb %al, %ch
+; FALLBACK24-NEXT:    shlb $3, %ch
 ; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    andb $28, %cl
-; FALLBACK24-NEXT:    movzbl %cl, %edi
-; FALLBACK24-NEXT:    movl 32(%esp,%edi), %esi
-; FALLBACK24-NEXT:    movl 36(%esp,%edi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %esi
-; FALLBACK24-NEXT:    movl %eax, %edx
-; FALLBACK24-NEXT:    notb %dl
-; FALLBACK24-NEXT:    addl %ebx, %ebx
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %esi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    andb $28, %al
+; FALLBACK24-NEXT:    movzbl %al, %edi
+; FALLBACK24-NEXT:    movl 32(%esp,%edi), %eax
+; FALLBACK24-NEXT:    movl 36(%esp,%edi), %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movb %ch, %bl
+; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    notb %bl
+; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK24-NEXT:    movl 44(%esp,%edi), %ebp
-; FALLBACK24-NEXT:    movl %ebp, %esi
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %esi
-; FALLBACK24-NEXT:    movl 48(%esp,%edi), %ecx
-; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %esi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %ebp, %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 48(%esp,%edi), %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK24-NEXT:    movl 40(%esp,%edi), %esi
-; FALLBACK24-NEXT:    movl %esi, %ebx
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl %esi, %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
 ; FALLBACK24-NEXT:    addl %ebp, %ebp
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movb %bl, %cl
 ; FALLBACK24-NEXT:    shll %cl, %ebp
-; FALLBACK24-NEXT:    orl %ebx, %ebp
-; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 52(%esp,%edi), %ebp
-; FALLBACK24-NEXT:    movl %ebp, %ebx
-; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    orl %eax, %ebp
+; FALLBACK24-NEXT:    movl 52(%esp,%edi), %eax
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    movl 56(%esp,%edi), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK24-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    addl %eax, %eax
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %edx, %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    movl 56(%esp,%edi), %ecx
-; FALLBACK24-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; FALLBACK24-NEXT:    leal (%ecx,%ecx), %edi
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movl 60(%esp,%edi), %edx
+; FALLBACK24-NEXT:    leal (%edx,%edx), %edi
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %edi
 ; FALLBACK24-NEXT:    orl %ebx, %edi
-; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    addl %ebp, %ebp
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shll %cl, %ebp
-; FALLBACK24-NEXT:    orl %edi, %ebp
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl 60(%esp,%ecx), %ebx
-; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edi
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shll %cl, %edi
-; FALLBACK24-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
 ; FALLBACK24-NEXT:    addl %esi, %esi
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT:    movl %ebx, 28(%eax)
-; FALLBACK24-NEXT:    movl %esi, 4(%eax)
-; FALLBACK24-NEXT:    movl %edi, 24(%eax)
-; FALLBACK24-NEXT:    movl %ebp, 16(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, (%eax)
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    movl %edx, 28(%ecx)
+; FALLBACK24-NEXT:    movl %esi, 4(%ecx)
+; FALLBACK24-NEXT:    movl %edi, 24(%ecx)
+; FALLBACK24-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK24-NEXT:    movl %ebp, 8(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, (%ecx)
 ; FALLBACK24-NEXT:    addl $108, %esp
 ; FALLBACK24-NEXT:    popl %esi
 ; FALLBACK24-NEXT:    popl %edi
@@ -4586,59 +4557,54 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK26-NEXT:    vmovups (%ecx), %ymm0
 ; FALLBACK26-NEXT:    movzbl (%eax), %ecx
-; FALLBACK26-NEXT:    movl %ecx, %edx
-; FALLBACK26-NEXT:    shlb $3, %dl
+; FALLBACK26-NEXT:    movl %ecx, %eax
+; FALLBACK26-NEXT:    shlb $3, %al
 ; FALLBACK26-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    andb $28, %cl
 ; FALLBACK26-NEXT:    movzbl %cl, %edi
-; FALLBACK26-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
-; FALLBACK26-NEXT:    movl %edx, %eax
-; FALLBACK26-NEXT:    notb %al
+; FALLBACK26-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    movl %eax, %edx
+; FALLBACK26-NEXT:    notb %dl
 ; FALLBACK26-NEXT:    movl 36(%esp,%edi), %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    addl %esi, %esi
-; FALLBACK26-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK26-NEXT:    orl %ecx, %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 48(%esp,%edi), %ecx
 ; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    addl %ecx, %ecx
-; FALLBACK26-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK26-NEXT:    movl %eax, %ebp
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %esi
 ; FALLBACK26-NEXT:    movl 44(%esp,%edi), %ecx
-; FALLBACK26-NEXT:    shrxl %edx, %ecx, %ebx
+; FALLBACK26-NEXT:    shrxl %eax, %ecx, %ebx
 ; FALLBACK26-NEXT:    orl %ebx, %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    addl %ecx, %ecx
-; FALLBACK26-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK26-NEXT:    movl 40(%esp,%edi), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %eax, %ebx
+; FALLBACK26-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK26-NEXT:    movl 40(%esp,%edi), %ecx
+; FALLBACK26-NEXT:    shrxl %eax, %ecx, %ebx
 ; FALLBACK26-NEXT:    orl %ebx, %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 56(%esp,%edi), %esi
 ; FALLBACK26-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK26-NEXT:    shlxl %ebp, %ebx, %eax
-; FALLBACK26-NEXT:    movl %ebp, %ecx
+; FALLBACK26-NEXT:    shlxl %edx, %ebx, %ebx
+; FALLBACK26-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK26-NEXT:    shrxl %edx, %ebx, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %eax, %ebx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK26-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %ebx, %ebx
-; FALLBACK26-NEXT:    shlxl %ecx, %ebx, %ebx
+; FALLBACK26-NEXT:    shlxl %edx, %ebx, %ebx
 ; FALLBACK26-NEXT:    orl %ebp, %ebx
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %eax, %esi, %ebp
+; FALLBACK26-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    movl 60(%esp,%edi), %edi
-; FALLBACK26-NEXT:    shrxl %edx, %edi, %eax
+; FALLBACK26-NEXT:    shrxl %eax, %edi, %eax
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    movl %ecx, %edx
-; FALLBACK26-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK26-NEXT:    orl %ebp, %edi
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK26-NEXT:    addl %ecx, %ecx
 ; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ecx
 ; FALLBACK26-NEXT:    orl %esi, %ecx
@@ -4736,94 +4702,94 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK28-NEXT:    movzbl (%eax), %ecx
-; FALLBACK28-NEXT:    movl %ecx, %eax
-; FALLBACK28-NEXT:    shlb $3, %al
+; FALLBACK28-NEXT:    movzbl (%eax), %eax
+; FALLBACK28-NEXT:    movb %al, %ch
+; FALLBACK28-NEXT:    shlb $3, %ch
 ; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK28-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    andb $28, %cl
-; FALLBACK28-NEXT:    movzbl %cl, %edi
-; FALLBACK28-NEXT:    movl 32(%esp,%edi), %esi
-; FALLBACK28-NEXT:    movl 36(%esp,%edi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %esi
-; FALLBACK28-NEXT:    movl %eax, %edx
-; FALLBACK28-NEXT:    notb %dl
-; FALLBACK28-NEXT:    addl %ebx, %ebx
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %esi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    andb $28, %al
+; FALLBACK28-NEXT:    movzbl %al, %edi
+; FALLBACK28-NEXT:    movl 32(%esp,%edi), %eax
+; FALLBACK28-NEXT:    movl 36(%esp,%edi), %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movb %ch, %bl
+; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    notb %bl
+; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK28-NEXT:    movl 44(%esp,%edi), %ebp
-; FALLBACK28-NEXT:    movl %ebp, %esi
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %esi
-; FALLBACK28-NEXT:    movl 48(%esp,%edi), %ecx
-; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %esi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %ebp, %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 48(%esp,%edi), %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK28-NEXT:    movl 40(%esp,%edi), %esi
-; FALLBACK28-NEXT:    movl %esi, %ebx
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl %esi, %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
 ; FALLBACK28-NEXT:    addl %ebp, %ebp
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movb %bl, %cl
 ; FALLBACK28-NEXT:    shll %cl, %ebp
-; FALLBACK28-NEXT:    orl %ebx, %ebp
-; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 52(%esp,%edi), %ebp
-; FALLBACK28-NEXT:    movl %ebp, %ebx
-; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    orl %eax, %ebp
+; FALLBACK28-NEXT:    movl 52(%esp,%edi), %eax
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    movl 56(%esp,%edi), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK28-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    addl %eax, %eax
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %edx, %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    movl 56(%esp,%edi), %ecx
-; FALLBACK28-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; FALLBACK28-NEXT:    leal (%ecx,%ecx), %edi
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movl 60(%esp,%edi), %edx
+; FALLBACK28-NEXT:    leal (%edx,%edx), %edi
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %edi
 ; FALLBACK28-NEXT:    orl %ebx, %edi
-; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    addl %ebp, %ebp
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shll %cl, %ebp
-; FALLBACK28-NEXT:    orl %edi, %ebp
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, (%esp) # 4-byte Folded Spill
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl 60(%esp,%ecx), %ebx
-; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edi
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shll %cl, %edi
-; FALLBACK28-NEXT:    orl (%esp), %edi # 4-byte Folded Reload
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
 ; FALLBACK28-NEXT:    addl %esi, %esi
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT:    movl %ebx, 28(%eax)
-; FALLBACK28-NEXT:    movl %esi, 4(%eax)
-; FALLBACK28-NEXT:    movl %edi, 24(%eax)
-; FALLBACK28-NEXT:    movl %ebp, 16(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, (%eax)
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    movl %edx, 28(%ecx)
+; FALLBACK28-NEXT:    movl %esi, 4(%ecx)
+; FALLBACK28-NEXT:    movl %edi, 24(%ecx)
+; FALLBACK28-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK28-NEXT:    movl %ebp, 8(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, (%ecx)
 ; FALLBACK28-NEXT:    addl $108, %esp
 ; FALLBACK28-NEXT:    popl %esi
 ; FALLBACK28-NEXT:    popl %edi
@@ -4907,59 +4873,54 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK30-NEXT:    vmovups (%ecx), %ymm0
 ; FALLBACK30-NEXT:    movzbl (%eax), %ecx
-; FALLBACK30-NEXT:    movl %ecx, %edx
-; FALLBACK30-NEXT:    shlb $3, %dl
+; FALLBACK30-NEXT:    movl %ecx, %eax
+; FALLBACK30-NEXT:    shlb $3, %al
 ; FALLBACK30-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK30-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    andb $28, %cl
 ; FALLBACK30-NEXT:    movzbl %cl, %edi
-; FALLBACK30-NEXT:    shrxl %edx, 32(%esp,%edi), %ecx
-; FALLBACK30-NEXT:    movl %edx, %eax
-; FALLBACK30-NEXT:    notb %al
-; FALLBACK30-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %eax, 32(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    movl %eax, %edx
+; FALLBACK30-NEXT:    notb %dl
+; FALLBACK30-NEXT:    movl 36(%esp,%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    addl %esi, %esi
-; FALLBACK30-NEXT:    shlxl %eax, %esi, %esi
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK30-NEXT:    orl %ecx, %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 48(%esp,%edi), %ecx
 ; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    addl %ecx, %ecx
-; FALLBACK30-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK30-NEXT:    movl %eax, %ebp
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %esi
 ; FALLBACK30-NEXT:    movl 44(%esp,%edi), %ecx
-; FALLBACK30-NEXT:    shrxl %edx, %ecx, %ebx
+; FALLBACK30-NEXT:    shrxl %eax, %ecx, %ebx
 ; FALLBACK30-NEXT:    orl %ebx, %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    addl %ecx, %ecx
-; FALLBACK30-NEXT:    shlxl %eax, %ecx, %esi
-; FALLBACK30-NEXT:    movl 40(%esp,%edi), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, %eax, %ebx
+; FALLBACK30-NEXT:    shlxl %edx, %ecx, %esi
+; FALLBACK30-NEXT:    movl 40(%esp,%edi), %ecx
+; FALLBACK30-NEXT:    shrxl %eax, %ecx, %ebx
 ; FALLBACK30-NEXT:    orl %ebx, %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 56(%esp,%edi), %esi
 ; FALLBACK30-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK30-NEXT:    shlxl %ebp, %ebx, %eax
-; FALLBACK30-NEXT:    movl %ebp, %ecx
+; FALLBACK30-NEXT:    shlxl %edx, %ebx, %ebx
+; FALLBACK30-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK30-NEXT:    shrxl %edx, %ebx, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %eax, %ebx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK30-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    addl %ebx, %ebx
-; FALLBACK30-NEXT:    shlxl %ecx, %ebx, %ebx
+; FALLBACK30-NEXT:    shlxl %edx, %ebx, %ebx
 ; FALLBACK30-NEXT:    orl %ebp, %ebx
-; FALLBACK30-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %eax, %esi, %ebp
+; FALLBACK30-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    movl 60(%esp,%edi), %edi
-; FALLBACK30-NEXT:    shrxl %edx, %edi, %eax
+; FALLBACK30-NEXT:    shrxl %eax, %edi, %eax
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    movl %ecx, %edx
-; FALLBACK30-NEXT:    shlxl %ecx, %edi, %edi
+; FALLBACK30-NEXT:    shlxl %edx, %edi, %edi
 ; FALLBACK30-NEXT:    orl %ebp, %edi
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK30-NEXT:    addl %ecx, %ecx
 ; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ecx
 ; FALLBACK30-NEXT:    orl %esi, %ecx
@@ -6513,23 +6474,23 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    pushl %edi
 ; FALLBACK16-NEXT:    pushl %esi
 ; FALLBACK16-NEXT:    subl $108, %esp
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT:    movl (%ecx), %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 4(%ecx), %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 8(%ecx), %esi
-; FALLBACK16-NEXT:    movl 12(%ecx), %edi
-; FALLBACK16-NEXT:    movl 16(%ecx), %ebx
-; FALLBACK16-NEXT:    movb (%eax), %ah
-; FALLBACK16-NEXT:    movl 20(%ecx), %ebp
-; FALLBACK16-NEXT:    movl 24(%ecx), %edx
-; FALLBACK16-NEXT:    movl 28(%ecx), %ecx
+; FALLBACK16-NEXT:    movl (%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%eax), %esi
+; FALLBACK16-NEXT:    movl 12(%eax), %edi
+; FALLBACK16-NEXT:    movl 16(%eax), %ebx
+; FALLBACK16-NEXT:    movzbl (%edx), %edx
+; FALLBACK16-NEXT:    movl 20(%eax), %ebp
+; FALLBACK16-NEXT:    movl 24(%eax), %ecx
+; FALLBACK16-NEXT:    movl 28(%eax), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movb %ah, %ch
-; FALLBACK16-NEXT:    shlb $3, %ch
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shlb $3, %cl
 ; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
@@ -6537,87 +6498,90 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    andb $28, %ah
-; FALLBACK16-NEXT:    negb %ah
-; FALLBACK16-NEXT:    movsbl %ah, %ebx
-; FALLBACK16-NEXT:    movl 64(%esp,%ebx), %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 68(%esp,%ebx), %eax
-; FALLBACK16-NEXT:    movl %eax, %esi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $28, %dl
+; FALLBACK16-NEXT:    negb %dl
+; FALLBACK16-NEXT:    movsbl %dl, %ebp
+; FALLBACK16-NEXT:    movl 64(%esp,%ebp), %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 68(%esp,%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %eax
+; FALLBACK16-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movb %cl, %ch
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    shrl %edx
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    movb %ch, %dl
-; FALLBACK16-NEXT:    notb %dl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 76(%esp,%ebp), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %edx
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl 72(%esp,%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, %edi
 ; FALLBACK16-NEXT:    shrl %edi
-; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    orl %esi, %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 76(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    orl %edx, %edi
 ; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edi
-; FALLBACK16-NEXT:    movl 72(%esp,%ebx), %esi
-; FALLBACK16-NEXT:    movl %esi, %ebp
-; FALLBACK16-NEXT:    shrl %ebp
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    orl %edi, %ebp
-; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    shrl %eax
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    orl %esi, %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 84(%esp,%ebx), %esi
-; FALLBACK16-NEXT:    movl %esi, %eax
-; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    movl 80(%esp,%ebx), %edi
-; FALLBACK16-NEXT:    movl %edi, %ebp
-; FALLBACK16-NEXT:    shrl %ebp
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    shrl %esi
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 84(%esp,%ebp), %edx
+; FALLBACK16-NEXT:    movl %edx, %esi
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %eax
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    orl %edi, %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 92(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movl 80(%esp,%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, %edi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %esi, %edi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    movl 88(%esp,%ebx), %edi
-; FALLBACK16-NEXT:    movl %edi, %ebx
 ; FALLBACK16-NEXT:    shrl %ebx
-; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; FALLBACK16-NEXT:    shrl %cl, %ebx
 ; FALLBACK16-NEXT:    orl %eax, %ebx
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 92(%esp,%ebp), %ebx
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl 88(%esp,%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, %esi
 ; FALLBACK16-NEXT:    shrl %esi
-; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %esi
-; FALLBACK16-NEXT:    orl %edi, %esi
+; FALLBACK16-NEXT:    orl %ebx, %esi
+; FALLBACK16-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    shrl %edx
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    shll %cl, %ebx
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl %edx, (%eax)
-; FALLBACK16-NEXT:    movl %esi, 24(%eax)
-; FALLBACK16-NEXT:    movl %ebx, 28(%eax)
+; FALLBACK16-NEXT:    movl %ebx, (%eax)
+; FALLBACK16-NEXT:    movl %edx, 24(%eax)
+; FALLBACK16-NEXT:    movl %esi, 28(%eax)
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK16-NEXT:    movl %ebp, 20(%eax)
+; FALLBACK16-NEXT:    movl %edi, 20(%eax)
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6733,71 +6697,67 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %ebx, %edx
-; FALLBACK18-NEXT:    shlb $3, %dl
+; FALLBACK18-NEXT:    movl %ebx, %eax
+; FALLBACK18-NEXT:    shlb $3, %al
 ; FALLBACK18-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    andb $28, %bl
 ; FALLBACK18-NEXT:    negb %bl
 ; FALLBACK18-NEXT:    movsbl %bl, %esi
 ; FALLBACK18-NEXT:    movl 64(%esp,%esi), %ebx
 ; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 68(%esp,%esi), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, %eax, %edi
-; FALLBACK18-NEXT:    movl %edx, %ecx
-; FALLBACK18-NEXT:    notb %cl
+; FALLBACK18-NEXT:    movl 68(%esp,%esi), %ecx
+; FALLBACK18-NEXT:    shlxl %eax, %ecx, %edi
+; FALLBACK18-NEXT:    movl %eax, %edx
+; FALLBACK18-NEXT:    notb %dl
 ; FALLBACK18-NEXT:    shrl %ebx
-; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %ebx
+; FALLBACK18-NEXT:    shrxl %edx, %ebx, %ebx
 ; FALLBACK18-NEXT:    orl %edi, %ebx
 ; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 72(%esp,%esi), %ebx
 ; FALLBACK18-NEXT:    movl %ebx, %edi
 ; FALLBACK18-NEXT:    shrl %edi
-; FALLBACK18-NEXT:    shrxl %ecx, %edi, %eax
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 76(%esp,%esi), %edi
-; FALLBACK18-NEXT:    shlxl %edx, %edi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, %ebx, %ebx
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT:    shrl %eax
-; FALLBACK18-NEXT:    shrxl %ecx, %eax, %eax
-; FALLBACK18-NEXT:    orl %ebx, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 80(%esp,%esi), %ebx
-; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %eax, %edi, %ebp
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK18-NEXT:    shlxl %eax, %ebx, %ebx
+; FALLBACK18-NEXT:    shrl %ecx
+; FALLBACK18-NEXT:    shrxl %edx, %ecx, %ecx
+; FALLBACK18-NEXT:    orl %ebx, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 80(%esp,%esi), %ecx
+; FALLBACK18-NEXT:    movl %ecx, %ebx
 ; FALLBACK18-NEXT:    shrl %ebx
-; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %eax
+; FALLBACK18-NEXT:    shrxl %edx, %ebx, %ebx
+; FALLBACK18-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK18-NEXT:    shlxl %edx, %ebx, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shlxl %eax, %ebx, %ebp
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK18-NEXT:    shlxl %eax, %ecx, %ecx
 ; FALLBACK18-NEXT:    shrl %edi
-; FALLBACK18-NEXT:    shrxl %ecx, %edi, %edi
-; FALLBACK18-NEXT:    orl %eax, %edi
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, 92(%esp,%esi), %ebp
+; FALLBACK18-NEXT:    shrxl %edx, %edi, %edi
+; FALLBACK18-NEXT:    orl %ecx, %edi
+; FALLBACK18-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    shlxl %eax, 92(%esp,%esi), %ebp
 ; FALLBACK18-NEXT:    movl 88(%esp,%esi), %esi
-; FALLBACK18-NEXT:    shlxl %edx, %esi, %eax
+; FALLBACK18-NEXT:    shlxl %eax, %esi, %eax
 ; FALLBACK18-NEXT:    shrl %esi
-; FALLBACK18-NEXT:    shrxl %ecx, %esi, %esi
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK18-NEXT:    orl %ebp, %esi
 ; FALLBACK18-NEXT:    shrl %ebx
-; FALLBACK18-NEXT:    shrxl %ecx, %ebx, %edx
+; FALLBACK18-NEXT:    shrxl %edx, %ebx, %edx
 ; FALLBACK18-NEXT:    orl %eax, %edx
 ; FALLBACK18-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK18-NEXT:    movl %ecx, (%eax)
 ; FALLBACK18-NEXT:    movl %edx, 24(%eax)
 ; FALLBACK18-NEXT:    movl %esi, 28(%eax)
@@ -6909,100 +6869,96 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK20-NEXT:    movups (%ecx), %xmm0
 ; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
-; FALLBACK20-NEXT:    movzbl (%eax), %ecx
-; FALLBACK20-NEXT:    movb %cl, %dh
-; FALLBACK20-NEXT:    shlb $3, %dh
+; FALLBACK20-NEXT:    movzbl (%eax), %eax
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shlb $3, %cl
 ; FALLBACK20-NEXT:    xorps %xmm2, %xmm2
 ; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    andb $28, %cl
-; FALLBACK20-NEXT:    negb %cl
-; FALLBACK20-NEXT:    movsbl %cl, %ebx
-; FALLBACK20-NEXT:    movl 84(%esp,%ebx), %edi
-; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dh, %cl
-; FALLBACK20-NEXT:    shll %cl, %edi
-; FALLBACK20-NEXT:    movb %dh, %dl
-; FALLBACK20-NEXT:    notb %dl
-; FALLBACK20-NEXT:    movl 80(%esp,%ebx), %esi
-; FALLBACK20-NEXT:    movl %esi, %eax
-; FALLBACK20-NEXT:    shrl %eax
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    orl %edi, %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dh, %cl
-; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    movl %ebx, %edi
-; FALLBACK20-NEXT:    movl 76(%esp,%ebx), %ebp
-; FALLBACK20-NEXT:    movl %ebp, %eax
-; FALLBACK20-NEXT:    shrl %eax
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    orl %esi, %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dh, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebp
-; FALLBACK20-NEXT:    movl 72(%esp,%ebx), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %eax
-; FALLBACK20-NEXT:    shrl %eax
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dh, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 68(%esp,%edi), %ebp
-; FALLBACK20-NEXT:    movl %ebp, %esi
+; FALLBACK20-NEXT:    andb $28, %al
+; FALLBACK20-NEXT:    negb %al
+; FALLBACK20-NEXT:    movsbl %al, %ebp
+; FALLBACK20-NEXT:    movl 84(%esp,%ebp), %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movb %cl, %ch
+; FALLBACK20-NEXT:    movl %ecx, %ebx
+; FALLBACK20-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    notb %ch
+; FALLBACK20-NEXT:    movl 80(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %esi
 ; FALLBACK20-NEXT:    shrl %esi
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %esi
-; FALLBACK20-NEXT:    orl %ebx, %esi
-; FALLBACK20-NEXT:    movb %dh, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebp
-; FALLBACK20-NEXT:    movl 64(%esp,%edi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    orl %edx, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    movl 76(%esp,%ebp), %edx
+; FALLBACK20-NEXT:    movl %edx, %esi
+; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %eax, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl 72(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %ebx
 ; FALLBACK20-NEXT:    shrl %ebx
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    orl %ebp, %ebx
-; FALLBACK20-NEXT:    movl 88(%esp,%edi), %ebp
-; FALLBACK20-NEXT:    movl %ebp, %edi
-; FALLBACK20-NEXT:    movb %dh, %cl
-; FALLBACK20-NEXT:    shll %cl, %edi
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    orl %edx, %ebx
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    movl 68(%esp,%ebp), %edx
+; FALLBACK20-NEXT:    movl %edx, %esi
+; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %eax, %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %edx, %edi
+; FALLBACK20-NEXT:    movl 88(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    shrl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movl 92(%esp,%ebp), %edx
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
 ; FALLBACK20-NEXT:    shrl %eax
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    orl %edi, %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT:    movl 92(%esp,%eax), %edi
-; FALLBACK20-NEXT:    movb %dh, %cl
-; FALLBACK20-NEXT:    shll %cl, %edi
-; FALLBACK20-NEXT:    shrl %ebp
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    orl %edi, %ebp
-; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    orl %edx, %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT:    movl %edx, (%eax)
-; FALLBACK20-NEXT:    movl %ebp, 28(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK20-NEXT:    movl %ebx, 4(%eax)
-; FALLBACK20-NEXT:    movl %esi, 8(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movl %edx, (%ecx)
+; FALLBACK20-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK20-NEXT:    movl %edi, 4(%ecx)
+; FALLBACK20-NEXT:    movl %esi, 8(%ecx)
+; FALLBACK20-NEXT:    movl %ebx, 12(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 20(%ecx)
 ; FALLBACK20-NEXT:    addl $108, %esp
 ; FALLBACK20-NEXT:    popl %esi
 ; FALLBACK20-NEXT:    popl %edi
@@ -7238,98 +7194,94 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT:    movzbl (%eax), %ecx
-; FALLBACK24-NEXT:    movb %cl, %dh
-; FALLBACK24-NEXT:    shlb $3, %dh
+; FALLBACK24-NEXT:    movzbl (%eax), %eax
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shlb $3, %cl
 ; FALLBACK24-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    andb $28, %cl
-; FALLBACK24-NEXT:    negb %cl
-; FALLBACK24-NEXT:    movsbl %cl, %ebx
-; FALLBACK24-NEXT:    movl 84(%esp,%ebx), %edi
-; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dh, %cl
-; FALLBACK24-NEXT:    shll %cl, %edi
-; FALLBACK24-NEXT:    movb %dh, %dl
-; FALLBACK24-NEXT:    notb %dl
-; FALLBACK24-NEXT:    movl 80(%esp,%ebx), %esi
-; FALLBACK24-NEXT:    movl %esi, %eax
-; FALLBACK24-NEXT:    shrl %eax
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    orl %edi, %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dh, %cl
-; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    movl %ebx, %edi
-; FALLBACK24-NEXT:    movl 76(%esp,%ebx), %ebp
-; FALLBACK24-NEXT:    movl %ebp, %eax
-; FALLBACK24-NEXT:    shrl %eax
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    orl %esi, %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dh, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebp
-; FALLBACK24-NEXT:    movl 72(%esp,%ebx), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %eax
-; FALLBACK24-NEXT:    shrl %eax
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dh, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 68(%esp,%edi), %ebp
-; FALLBACK24-NEXT:    movl %ebp, %esi
+; FALLBACK24-NEXT:    andb $28, %al
+; FALLBACK24-NEXT:    negb %al
+; FALLBACK24-NEXT:    movsbl %al, %ebp
+; FALLBACK24-NEXT:    movl 84(%esp,%ebp), %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movb %cl, %ch
+; FALLBACK24-NEXT:    movl %ecx, %ebx
+; FALLBACK24-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    notb %ch
+; FALLBACK24-NEXT:    movl 80(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %esi
+; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %edx, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    movl 76(%esp,%ebp), %edx
+; FALLBACK24-NEXT:    movl %edx, %esi
 ; FALLBACK24-NEXT:    shrl %esi
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; FALLBACK24-NEXT:    shrl %cl, %esi
-; FALLBACK24-NEXT:    orl %ebx, %esi
-; FALLBACK24-NEXT:    movb %dh, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebp
-; FALLBACK24-NEXT:    movl 64(%esp,%edi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    orl %eax, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl 72(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %ebx
 ; FALLBACK24-NEXT:    shrl %ebx
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    orl %ebp, %ebx
-; FALLBACK24-NEXT:    movl 88(%esp,%edi), %ebp
-; FALLBACK24-NEXT:    movl %ebp, %edi
-; FALLBACK24-NEXT:    movb %dh, %cl
-; FALLBACK24-NEXT:    shll %cl, %edi
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    orl %edx, %ebx
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    movl 68(%esp,%ebp), %edx
+; FALLBACK24-NEXT:    movl %edx, %esi
+; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %eax, %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %edx, %edi
+; FALLBACK24-NEXT:    movl 88(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    shrl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movl 92(%esp,%ebp), %edx
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
 ; FALLBACK24-NEXT:    shrl %eax
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    orl %edi, %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT:    movl 92(%esp,%eax), %edi
-; FALLBACK24-NEXT:    movb %dh, %cl
-; FALLBACK24-NEXT:    shll %cl, %edi
-; FALLBACK24-NEXT:    shrl %ebp
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    orl %edi, %ebp
-; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    orl %edx, %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT:    movl %edx, (%eax)
-; FALLBACK24-NEXT:    movl %ebp, 28(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK24-NEXT:    movl %ebx, 4(%eax)
-; FALLBACK24-NEXT:    movl %esi, 8(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    movl %edx, (%ecx)
+; FALLBACK24-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK24-NEXT:    movl %edi, 4(%ecx)
+; FALLBACK24-NEXT:    movl %esi, 8(%ecx)
+; FALLBACK24-NEXT:    movl %ebx, 12(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 20(%ecx)
 ; FALLBACK24-NEXT:    addl $108, %esp
 ; FALLBACK24-NEXT:    popl %esi
 ; FALLBACK24-NEXT:    popl %edi
@@ -7560,98 +7512,94 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK28-NEXT:    movzbl (%eax), %ecx
-; FALLBACK28-NEXT:    movb %cl, %dh
-; FALLBACK28-NEXT:    shlb $3, %dh
+; FALLBACK28-NEXT:    movzbl (%eax), %eax
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shlb $3, %cl
 ; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK28-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    andb $28, %cl
-; FALLBACK28-NEXT:    negb %cl
-; FALLBACK28-NEXT:    movsbl %cl, %ebx
-; FALLBACK28-NEXT:    movl 84(%esp,%ebx), %edi
-; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dh, %cl
-; FALLBACK28-NEXT:    shll %cl, %edi
-; FALLBACK28-NEXT:    movb %dh, %dl
-; FALLBACK28-NEXT:    notb %dl
-; FALLBACK28-NEXT:    movl 80(%esp,%ebx), %esi
-; FALLBACK28-NEXT:    movl %esi, %eax
-; FALLBACK28-NEXT:    shrl %eax
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    orl %edi, %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dh, %cl
-; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    movl %ebx, %edi
-; FALLBACK28-NEXT:    movl 76(%esp,%ebx), %ebp
-; FALLBACK28-NEXT:    movl %ebp, %eax
-; FALLBACK28-NEXT:    shrl %eax
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    orl %esi, %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dh, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebp
-; FALLBACK28-NEXT:    movl 72(%esp,%ebx), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %eax
-; FALLBACK28-NEXT:    shrl %eax
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dh, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 68(%esp,%edi), %ebp
-; FALLBACK28-NEXT:    movl %ebp, %esi
+; FALLBACK28-NEXT:    andb $28, %al
+; FALLBACK28-NEXT:    negb %al
+; FALLBACK28-NEXT:    movsbl %al, %ebp
+; FALLBACK28-NEXT:    movl 84(%esp,%ebp), %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movb %cl, %ch
+; FALLBACK28-NEXT:    movl %ecx, %ebx
+; FALLBACK28-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    notb %ch
+; FALLBACK28-NEXT:    movl 80(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %esi
 ; FALLBACK28-NEXT:    shrl %esi
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %esi
-; FALLBACK28-NEXT:    orl %ebx, %esi
-; FALLBACK28-NEXT:    movb %dh, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebp
-; FALLBACK28-NEXT:    movl 64(%esp,%edi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    orl %edx, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    movl 76(%esp,%ebp), %edx
+; FALLBACK28-NEXT:    movl %edx, %esi
+; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %eax, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl 72(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %ebx
 ; FALLBACK28-NEXT:    shrl %ebx
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    orl %ebp, %ebx
-; FALLBACK28-NEXT:    movl 88(%esp,%edi), %ebp
-; FALLBACK28-NEXT:    movl %ebp, %edi
-; FALLBACK28-NEXT:    movb %dh, %cl
-; FALLBACK28-NEXT:    shll %cl, %edi
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    orl %edx, %ebx
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    movl 68(%esp,%ebp), %edx
+; FALLBACK28-NEXT:    movl %edx, %esi
+; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %eax, %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl 64(%esp,%ebp), %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %edx, %edi
+; FALLBACK28-NEXT:    movl 88(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    shrl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movl 92(%esp,%ebp), %edx
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
 ; FALLBACK28-NEXT:    shrl %eax
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    orl %edi, %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT:    movl 92(%esp,%eax), %edi
-; FALLBACK28-NEXT:    movb %dh, %cl
-; FALLBACK28-NEXT:    shll %cl, %edi
-; FALLBACK28-NEXT:    shrl %ebp
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    orl %edi, %ebp
-; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    orl %edx, %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT:    movl %edx, (%eax)
-; FALLBACK28-NEXT:    movl %ebp, 28(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK28-NEXT:    movl %ebx, 4(%eax)
-; FALLBACK28-NEXT:    movl %esi, 8(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    movl %edx, (%ecx)
+; FALLBACK28-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK28-NEXT:    movl %edi, 4(%ecx)
+; FALLBACK28-NEXT:    movl %esi, 8(%ecx)
+; FALLBACK28-NEXT:    movl %ebx, 12(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 20(%ecx)
 ; FALLBACK28-NEXT:    addl $108, %esp
 ; FALLBACK28-NEXT:    popl %esi
 ; FALLBACK28-NEXT:    popl %edi
@@ -9454,114 +9402,110 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    pushl %esi
 ; FALLBACK16-NEXT:    subl $108, %esp
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; FALLBACK16-NEXT:    movl (%esi), %ecx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; FALLBACK16-NEXT:    movl (%edx), %ecx
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 4(%esi), %ecx
+; FALLBACK16-NEXT:    movl 4(%edx), %ecx
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 8(%esi), %ebx
-; FALLBACK16-NEXT:    movl 12(%esi), %ebp
-; FALLBACK16-NEXT:    movl 16(%esi), %edi
-; FALLBACK16-NEXT:    movzbl (%eax), %ecx
-; FALLBACK16-NEXT:    movl 20(%esi), %edx
-; FALLBACK16-NEXT:    movl 24(%esi), %eax
-; FALLBACK16-NEXT:    movl 28(%esi), %esi
+; FALLBACK16-NEXT:    movl 8(%edx), %edi
+; FALLBACK16-NEXT:    movl 12(%edx), %esi
+; FALLBACK16-NEXT:    movl 16(%edx), %ebp
+; FALLBACK16-NEXT:    movzbl (%eax), %ebx
+; FALLBACK16-NEXT:    movl 20(%edx), %eax
+; FALLBACK16-NEXT:    movl 24(%edx), %ecx
+; FALLBACK16-NEXT:    movl 28(%edx), %edx
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shlb $3, %cl
 ; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ecx, %edx
-; FALLBACK16-NEXT:    shlb $3, %dl
+; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    sarl $31, %esi
-; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    andb $28, %cl
-; FALLBACK16-NEXT:    movzbl %cl, %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 32(%esp,%edi), %esi
-; FALLBACK16-NEXT:    movl 36(%esp,%edi), %eax
-; FALLBACK16-NEXT:    movl %eax, %ebx
-; FALLBACK16-NEXT:    movl %edx, %ecx
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    movb %dl, %ch
+; FALLBACK16-NEXT:    sarl $31, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    andb $28, %bl
+; FALLBACK16-NEXT:    movzbl %bl, %ebx
+; FALLBACK16-NEXT:    movl 32(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    movl 36(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, %esi
+; FALLBACK16-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    movb %cl, %ch
 ; FALLBACK16-NEXT:    notb %ch
-; FALLBACK16-NEXT:    movl 40(%esp,%edi), %edi
-; FALLBACK16-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK16-NEXT:    movl 40(%esp,%ebx), %edx
+; FALLBACK16-NEXT:    leal (%edx,%edx), %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %ebx, %ebp
+; FALLBACK16-NEXT:    orl %esi, %ebp
 ; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %esi
-; FALLBACK16-NEXT:    addl %eax, %eax
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %edi, %edi
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %esi, %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl 44(%esp,%eax), %ebp
-; FALLBACK16-NEXT:    movl %ebp, %esi
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    movl %edx, %ebx
-; FALLBACK16-NEXT:    shrl %cl, %esi
-; FALLBACK16-NEXT:    movl 48(%esp,%eax), %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 44(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %eax
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %esi, %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %ebx, %edx
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 48(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
 ; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %edi, %ebp
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK16-NEXT:    movl 52(%esp,%esi), %edi
-; FALLBACK16-NEXT:    movl %edi, %eax
-; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    orl %edx, %ebp
+; FALLBACK16-NEXT:    movl 52(%esp,%ebx), %edx
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl 56(%esp,%esi), %ebx
-; FALLBACK16-NEXT:    leal (%ebx,%ebx), %esi
+; FALLBACK16-NEXT:    movl 56(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %esi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %esi
 ; FALLBACK16-NEXT:    orl %eax, %esi
-; FALLBACK16-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    addl %edi, %edi
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edi
-; FALLBACK16-NEXT:    orl %eax, %edi
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl 60(%esp,%eax), %eax
-; FALLBACK16-NEXT:    leal (%eax,%eax), %edx
+; FALLBACK16-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    addl %edx, %edx
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    movl 60(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %edi, %ebx
 ; FALLBACK16-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; FALLBACK16-NEXT:    sarl %cl, %eax
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK16-NEXT:    movl %eax, 28(%ecx)
-; FALLBACK16-NEXT:    movl %edx, 24(%ecx)
-; FALLBACK16-NEXT:    movl %edi, 16(%ecx)
+; FALLBACK16-NEXT:    movl %ebx, 24(%ecx)
+; FALLBACK16-NEXT:    movl %edx, 16(%ecx)
 ; FALLBACK16-NEXT:    movl %esi, 20(%ecx)
 ; FALLBACK16-NEXT:    movl %ebp, 8(%ecx)
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -9584,33 +9528,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK17-NEXT:    pushl %edi
 ; FALLBACK17-NEXT:    pushl %esi
 ; FALLBACK17-NEXT:    subl $92, %esp
-; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT:    movl (%ecx), %edx
-; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 4(%ecx), %edx
-; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 8(%ecx), %edx
-; FALLBACK17-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 12(%ecx), %ebp
-; FALLBACK17-NEXT:    movl 16(%ecx), %ebx
-; FALLBACK17-NEXT:    movzbl (%eax), %eax
-; FALLBACK17-NEXT:    movl 20(%ecx), %edi
-; FALLBACK17-NEXT:    movl 24(%ecx), %edx
-; FALLBACK17-NEXT:    movl 28(%ecx), %esi
-; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT:    movl %eax, %ecx
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; FALLBACK17-NEXT:    movl (%esi), %eax
+; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 4(%esi), %eax
+; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 8(%esi), %edi
+; FALLBACK17-NEXT:    movl 12(%esi), %ebp
+; FALLBACK17-NEXT:    movl 16(%esi), %edx
+; FALLBACK17-NEXT:    movzbl (%ecx), %ebx
+; FALLBACK17-NEXT:    movl 20(%esi), %ecx
+; FALLBACK17-NEXT:    movl 24(%esi), %eax
+; FALLBACK17-NEXT:    movl 28(%esi), %esi
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %ebx, %ecx
 ; FALLBACK17-NEXT:    shlb $3, %cl
 ; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK17-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT:    movl (%esp), %edx # 4-byte Reload
-; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK17-NEXT:    sarl $31, %esi
 ; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
@@ -9620,18 +9562,18 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK17-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT:    andb $28, %al
-; FALLBACK17-NEXT:    movzbl %al, %ebp
+; FALLBACK17-NEXT:    andb $28, %bl
+; FALLBACK17-NEXT:    movzbl %bl, %ebp
 ; FALLBACK17-NEXT:    movl 24(%esp,%ebp), %edx
 ; FALLBACK17-NEXT:    movl 20(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
 ; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK17-NEXT:    movl 32(%esp,%ebp), %ebx
 ; FALLBACK17-NEXT:    movl 28(%esp,%ebp), %eax
 ; FALLBACK17-NEXT:    movl %eax, %esi
 ; FALLBACK17-NEXT:    shrdl %cl, %ebx, %esi
-; FALLBACK17-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
 ; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK17-NEXT:    movl 40(%esp,%ebp), %edx
@@ -9644,7 +9586,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
 ; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; FALLBACK17-NEXT:    movl %edx, 24(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl (%esp), %edx # 4-byte Reload
 ; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
 ; FALLBACK17-NEXT:    sarl %cl, %eax
 ; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
@@ -9652,7 +9594,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK17-NEXT:    movl %edi, 20(%ebp)
 ; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
-; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
 ; FALLBACK17-NEXT:    movl %esi, (%ebp)
 ; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -9677,25 +9619,25 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 4(%esi), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 8(%esi), %ebx
+; FALLBACK18-NEXT:    movl 8(%esi), %edi
 ; FALLBACK18-NEXT:    movl 12(%esi), %ebp
-; FALLBACK18-NEXT:    movl 16(%esi), %edi
-; FALLBACK18-NEXT:    movzbl (%ecx), %ecx
-; FALLBACK18-NEXT:    movl 20(%esi), %edx
-; FALLBACK18-NEXT:    movl 24(%esi), %eax
+; FALLBACK18-NEXT:    movl 16(%esi), %eax
+; FALLBACK18-NEXT:    movzbl (%ecx), %ebx
+; FALLBACK18-NEXT:    movl 20(%esi), %ecx
+; FALLBACK18-NEXT:    movl 24(%esi), %edx
 ; FALLBACK18-NEXT:    movl 28(%esi), %esi
-; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %ecx, %eax
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %ebx, %eax
 ; FALLBACK18-NEXT:    shlb $3, %al
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    sarl $31, %esi
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
@@ -9705,8 +9647,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    andb $28, %cl
-; FALLBACK18-NEXT:    movzbl %cl, %edi
+; FALLBACK18-NEXT:    andb $28, %bl
+; FALLBACK18-NEXT:    movzbl %bl, %edi
 ; FALLBACK18-NEXT:    movl 36(%esp,%edi), %esi
 ; FALLBACK18-NEXT:    movl 40(%esp,%edi), %ecx
 ; FALLBACK18-NEXT:    shrxl %eax, %esi, %ebx
@@ -9779,29 +9721,29 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK19-NEXT:    pushl %edi
 ; FALLBACK19-NEXT:    pushl %esi
 ; FALLBACK19-NEXT:    subl $92, %esp
-; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; FALLBACK19-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT:    movl (%ecx), %edx
-; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT:    movl 4(%ecx), %edx
-; FALLBACK19-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT:    movl 8(%ecx), %edx
-; FALLBACK19-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK19-NEXT:    movl (%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 4(%ecx), %eax
+; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK19-NEXT:    movl 8(%ecx), %edi
 ; FALLBACK19-NEXT:    movl 12(%ecx), %ebp
-; FALLBACK19-NEXT:    movl 16(%ecx), %ebx
-; FALLBACK19-NEXT:    movzbl (%eax), %eax
-; FALLBACK19-NEXT:    movl 20(%ecx), %edi
-; FALLBACK19-NEXT:    movl 24(%ecx), %edx
+; FALLBACK19-NEXT:    movl 16(%ecx), %eax
+; FALLBACK19-NEXT:    movzbl (%edx), %edx
+; FALLBACK19-NEXT:    movb %dl, (%esp) # 1-byte Spill
+; FALLBACK19-NEXT:    movl 20(%ecx), %edx
+; FALLBACK19-NEXT:    movl 24(%ecx), %ebx
 ; FALLBACK19-NEXT:    movl 28(%ecx), %esi
-; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK19-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movzbl (%esp), %eax # 1-byte Folded Reload
 ; FALLBACK19-NEXT:    movl %eax, %ecx
 ; FALLBACK19-NEXT:    shlb $3, %cl
 ; FALLBACK19-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK19-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT:    movl (%esp), %edx # 4-byte Reload
-; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK19-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK19-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -9870,105 +9812,105 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK20-NEXT:    movups (%ecx), %xmm0
-; FALLBACK20-NEXT:    movl 16(%ecx), %esi
-; FALLBACK20-NEXT:    movl 20(%ecx), %edi
-; FALLBACK20-NEXT:    movl 24(%ecx), %ebx
-; FALLBACK20-NEXT:    movl 28(%ecx), %edx
+; FALLBACK20-NEXT:    movl 16(%ecx), %edx
+; FALLBACK20-NEXT:    movl 20(%ecx), %esi
+; FALLBACK20-NEXT:    movl 24(%ecx), %edi
+; FALLBACK20-NEXT:    movl 28(%ecx), %ecx
 ; FALLBACK20-NEXT:    movzbl (%eax), %eax
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shlb $3, %cl
-; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, %ebx
+; FALLBACK20-NEXT:    shlb $3, %bl
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    sarl $31, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    sarl $31, %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    andb $28, %al
-; FALLBACK20-NEXT:    movzbl %al, %edi
-; FALLBACK20-NEXT:    movl 32(%esp,%edi), %eax
-; FALLBACK20-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    movl %ecx, %edx
-; FALLBACK20-NEXT:    movb %cl, %dh
-; FALLBACK20-NEXT:    notb %dl
-; FALLBACK20-NEXT:    addl %esi, %esi
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    orl %eax, %esi
-; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 44(%esp,%edi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %eax
-; FALLBACK20-NEXT:    movb %dh, %cl
-; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    movl 48(%esp,%edi), %esi
-; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    addl %esi, %esi
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    orl %eax, %esi
-; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 40(%esp,%edi), %esi
-; FALLBACK20-NEXT:    movl %esi, %eax
-; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    movzbl %al, %esi
+; FALLBACK20-NEXT:    movl 32(%esp,%esi), %eax
+; FALLBACK20-NEXT:    movl 36(%esp,%esi), %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %ebx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    addl %ebx, %ebx
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %eax, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK20-NEXT:    movb %bl, %bh
+; FALLBACK20-NEXT:    notb %bh
+; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %bh, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 44(%esp,%esi), %ebp
 ; FALLBACK20-NEXT:    movl %ebp, %eax
-; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    movl %ebx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK20-NEXT:    movl 48(%esp,%esi), %ecx
 ; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %eax, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dh, %cl
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %edx
+; FALLBACK20-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    movb %bh, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 40(%esp,%esi), %edi
+; FALLBACK20-NEXT:    movl %edi, %eax
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; FALLBACK20-NEXT:    shrl %cl, %eax
 ; FALLBACK20-NEXT:    addl %ebp, %ebp
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    movb %bh, %cl
 ; FALLBACK20-NEXT:    shll %cl, %ebp
 ; FALLBACK20-NEXT:    orl %eax, %ebp
-; FALLBACK20-NEXT:    movb %dh, %cl
+; FALLBACK20-NEXT:    movl 52(%esp,%esi), %eax
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    movl 56(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    addl %eax, %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl 60(%esp,%esi), %edx
+; FALLBACK20-NEXT:    leal (%edx,%edx), %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    movl 60(%esp,%edi), %eax
-; FALLBACK20-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    addl %edi, %edi
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; FALLBACK20-NEXT:    shll %cl, %edi
 ; FALLBACK20-NEXT:    orl %ebx, %edi
-; FALLBACK20-NEXT:    movb %dh, %cl
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    addl %esi, %esi
-; FALLBACK20-NEXT:    movl %edx, %ecx
-; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    orl %ebx, %esi
-; FALLBACK20-NEXT:    movb %dh, %cl
-; FALLBACK20-NEXT:    sarl %cl, %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    sarl %cl, %edx
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT:    movl %eax, 28(%ecx)
-; FALLBACK20-NEXT:    movl %esi, 4(%ecx)
-; FALLBACK20-NEXT:    movl %edi, 24(%ecx)
-; FALLBACK20-NEXT:    movl %ebp, 16(%ecx)
+; FALLBACK20-NEXT:    movl %edx, 28(%ecx)
+; FALLBACK20-NEXT:    movl %edi, 4(%ecx)
+; FALLBACK20-NEXT:    movl %esi, 24(%ecx)
+; FALLBACK20-NEXT:    movl %eax, 16(%ecx)
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK20-NEXT:    movl %eax, 20(%ecx)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK20-NEXT:    movl %ebp, 8(%ecx)
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK20-NEXT:    movl %eax, 12(%ecx)
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -10111,31 +10053,27 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    addl %ecx, %ecx
 ; FALLBACK22-NEXT:    shlxl %edx, %ecx, %esi
 ; FALLBACK22-NEXT:    movl 40(%esp,%edi), %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    shrxl %eax, %ecx, %ebx
-; FALLBACK22-NEXT:    movl %eax, %ecx
 ; FALLBACK22-NEXT:    orl %ebx, %esi
 ; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 56(%esp,%edi), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK22-NEXT:    shlxl %edx, %ebx, %eax
+; FALLBACK22-NEXT:    shlxl %edx, %ebx, %ebx
+; FALLBACK22-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK22-NEXT:    shrxl %ecx, %ebx, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl %ecx, %eax
-; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %eax, %ebx, %ebp
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK22-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %ebx, %ebx
 ; FALLBACK22-NEXT:    shlxl %edx, %ebx, %ebx
 ; FALLBACK22-NEXT:    orl %ebp, %ebx
-; FALLBACK22-NEXT:    shrxl %ecx, %esi, %ecx
+; FALLBACK22-NEXT:    shrxl %eax, %esi, %ebp
 ; FALLBACK22-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    movl 60(%esp,%edi), %edi
 ; FALLBACK22-NEXT:    sarxl %eax, %edi, %eax
 ; FALLBACK22-NEXT:    addl %edi, %edi
 ; FALLBACK22-NEXT:    shlxl %edx, %edi, %edi
-; FALLBACK22-NEXT:    orl %ecx, %edi
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    orl %ebp, %edi
 ; FALLBACK22-NEXT:    addl %ecx, %ecx
 ; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ecx
 ; FALLBACK22-NEXT:    orl %esi, %ecx
@@ -10246,105 +10184,105 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK24-NEXT:    vmovups (%ecx), %xmm0
-; FALLBACK24-NEXT:    movl 16(%ecx), %esi
-; FALLBACK24-NEXT:    movl 20(%ecx), %edi
-; FALLBACK24-NEXT:    movl 24(%ecx), %ebx
-; FALLBACK24-NEXT:    movl 28(%ecx), %edx
+; FALLBACK24-NEXT:    movl 16(%ecx), %edx
+; FALLBACK24-NEXT:    movl 20(%ecx), %esi
+; FALLBACK24-NEXT:    movl 24(%ecx), %edi
+; FALLBACK24-NEXT:    movl 28(%ecx), %ecx
 ; FALLBACK24-NEXT:    movzbl (%eax), %eax
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shlb $3, %cl
-; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, %ebx
+; FALLBACK24-NEXT:    shlb $3, %bl
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    sarl $31, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    sarl $31, %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    andb $28, %al
-; FALLBACK24-NEXT:    movzbl %al, %edi
-; FALLBACK24-NEXT:    movl 32(%esp,%edi), %eax
-; FALLBACK24-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    movl %ecx, %edx
-; FALLBACK24-NEXT:    movb %cl, %dh
-; FALLBACK24-NEXT:    notb %dl
-; FALLBACK24-NEXT:    addl %esi, %esi
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    orl %eax, %esi
-; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 44(%esp,%edi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %eax
-; FALLBACK24-NEXT:    movb %dh, %cl
-; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    movl 48(%esp,%edi), %esi
-; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    addl %esi, %esi
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    orl %eax, %esi
-; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 40(%esp,%edi), %esi
-; FALLBACK24-NEXT:    movl %esi, %eax
-; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    movzbl %al, %esi
+; FALLBACK24-NEXT:    movl 32(%esp,%esi), %eax
+; FALLBACK24-NEXT:    movl 36(%esp,%esi), %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %ebx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    addl %ebx, %ebx
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %eax, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK24-NEXT:    movb %bl, %bh
+; FALLBACK24-NEXT:    notb %bh
+; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %bh, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 44(%esp,%esi), %ebp
 ; FALLBACK24-NEXT:    movl %ebp, %eax
-; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    movl %ebx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK24-NEXT:    movl 48(%esp,%esi), %ecx
 ; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %eax, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dh, %cl
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %edx
+; FALLBACK24-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    movb %bh, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 40(%esp,%esi), %edi
+; FALLBACK24-NEXT:    movl %edi, %eax
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; FALLBACK24-NEXT:    shrl %cl, %eax
 ; FALLBACK24-NEXT:    addl %ebp, %ebp
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    movb %bh, %cl
 ; FALLBACK24-NEXT:    shll %cl, %ebp
 ; FALLBACK24-NEXT:    orl %eax, %ebp
-; FALLBACK24-NEXT:    movb %dh, %cl
+; FALLBACK24-NEXT:    movl 52(%esp,%esi), %eax
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    movl 56(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    addl %eax, %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl 60(%esp,%esi), %edx
+; FALLBACK24-NEXT:    leal (%edx,%edx), %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    movl 60(%esp,%edi), %eax
-; FALLBACK24-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    addl %edi, %edi
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; FALLBACK24-NEXT:    shll %cl, %edi
 ; FALLBACK24-NEXT:    orl %ebx, %edi
-; FALLBACK24-NEXT:    movb %dh, %cl
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    addl %esi, %esi
-; FALLBACK24-NEXT:    movl %edx, %ecx
-; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    orl %ebx, %esi
-; FALLBACK24-NEXT:    movb %dh, %cl
-; FALLBACK24-NEXT:    sarl %cl, %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    sarl %cl, %edx
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT:    movl %eax, 28(%ecx)
-; FALLBACK24-NEXT:    movl %esi, 4(%ecx)
-; FALLBACK24-NEXT:    movl %edi, 24(%ecx)
-; FALLBACK24-NEXT:    movl %ebp, 16(%ecx)
+; FALLBACK24-NEXT:    movl %edx, 28(%ecx)
+; FALLBACK24-NEXT:    movl %edi, 4(%ecx)
+; FALLBACK24-NEXT:    movl %esi, 24(%ecx)
+; FALLBACK24-NEXT:    movl %eax, 16(%ecx)
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK24-NEXT:    movl %eax, 20(%ecx)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK24-NEXT:    movl %ebp, 8(%ecx)
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK24-NEXT:    movl %eax, 12(%ecx)
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -10487,31 +10425,27 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    addl %ecx, %ecx
 ; FALLBACK26-NEXT:    shlxl %edx, %ecx, %esi
 ; FALLBACK26-NEXT:    movl 40(%esp,%edi), %ecx
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    shrxl %eax, %ecx, %ebx
-; FALLBACK26-NEXT:    movl %eax, %ecx
 ; FALLBACK26-NEXT:    orl %ebx, %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 56(%esp,%edi), %esi
 ; FALLBACK26-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK26-NEXT:    shlxl %edx, %ebx, %eax
+; FALLBACK26-NEXT:    shlxl %edx, %ebx, %ebx
+; FALLBACK26-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK26-NEXT:    shrxl %ecx, %ebx, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl %ecx, %eax
-; FALLBACK26-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK26-NEXT:    shrxl %eax, %ebx, %ebp
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK26-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    addl %ebx, %ebx
 ; FALLBACK26-NEXT:    shlxl %edx, %ebx, %ebx
 ; FALLBACK26-NEXT:    orl %ebp, %ebx
-; FALLBACK26-NEXT:    shrxl %ecx, %esi, %ecx
+; FALLBACK26-NEXT:    shrxl %eax, %esi, %ebp
 ; FALLBACK26-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; FALLBACK26-NEXT:    movl 60(%esp,%edi), %edi
 ; FALLBACK26-NEXT:    sarxl %eax, %edi, %eax
 ; FALLBACK26-NEXT:    addl %edi, %edi
 ; FALLBACK26-NEXT:    shlxl %edx, %edi, %edi
-; FALLBACK26-NEXT:    orl %ecx, %edi
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK26-NEXT:    orl %ebp, %edi
 ; FALLBACK26-NEXT:    addl %ecx, %ecx
 ; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ecx
 ; FALLBACK26-NEXT:    orl %esi, %ecx
@@ -10622,105 +10556,105 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK28-NEXT:    vmovups (%ecx), %xmm0
-; FALLBACK28-NEXT:    movl 16(%ecx), %esi
-; FALLBACK28-NEXT:    movl 20(%ecx), %edi
-; FALLBACK28-NEXT:    movl 24(%ecx), %ebx
-; FALLBACK28-NEXT:    movl 28(%ecx), %edx
+; FALLBACK28-NEXT:    movl 16(%ecx), %edx
+; FALLBACK28-NEXT:    movl 20(%ecx), %esi
+; FALLBACK28-NEXT:    movl 24(%ecx), %edi
+; FALLBACK28-NEXT:    movl 28(%ecx), %ecx
 ; FALLBACK28-NEXT:    movzbl (%eax), %eax
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shlb $3, %cl
-; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, %ebx
+; FALLBACK28-NEXT:    shlb $3, %bl
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    sarl $31, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    sarl $31, %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    andb $28, %al
-; FALLBACK28-NEXT:    movzbl %al, %edi
-; FALLBACK28-NEXT:    movl 32(%esp,%edi), %eax
-; FALLBACK28-NEXT:    movl 36(%esp,%edi), %esi
-; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    movl %ecx, %edx
-; FALLBACK28-NEXT:    movb %cl, %dh
-; FALLBACK28-NEXT:    notb %dl
-; FALLBACK28-NEXT:    addl %esi, %esi
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    orl %eax, %esi
-; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 44(%esp,%edi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %eax
-; FALLBACK28-NEXT:    movb %dh, %cl
-; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    movl 48(%esp,%edi), %esi
-; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    addl %esi, %esi
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    orl %eax, %esi
-; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 40(%esp,%edi), %esi
-; FALLBACK28-NEXT:    movl %esi, %eax
-; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    movzbl %al, %esi
+; FALLBACK28-NEXT:    movl 32(%esp,%esi), %eax
+; FALLBACK28-NEXT:    movl 36(%esp,%esi), %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %ebx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    addl %ebx, %ebx
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %eax, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 52(%esp,%edi), %ebp
+; FALLBACK28-NEXT:    movb %bl, %bh
+; FALLBACK28-NEXT:    notb %bh
+; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %bh, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 44(%esp,%esi), %ebp
 ; FALLBACK28-NEXT:    movl %ebp, %eax
-; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    movl %ebx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    movl 56(%esp,%edi), %ecx
+; FALLBACK28-NEXT:    movl 48(%esp,%esi), %ecx
 ; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %eax, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dh, %cl
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %edx
+; FALLBACK28-NEXT:    movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    movb %bh, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 40(%esp,%esi), %edi
+; FALLBACK28-NEXT:    movl %edi, %eax
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; FALLBACK28-NEXT:    shrl %cl, %eax
 ; FALLBACK28-NEXT:    addl %ebp, %ebp
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    movb %bh, %cl
 ; FALLBACK28-NEXT:    shll %cl, %ebp
 ; FALLBACK28-NEXT:    orl %eax, %ebp
-; FALLBACK28-NEXT:    movb %dh, %cl
+; FALLBACK28-NEXT:    movl 52(%esp,%esi), %eax
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    movl 56(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    addl %eax, %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl 60(%esp,%esi), %edx
+; FALLBACK28-NEXT:    leal (%edx,%edx), %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    movl 60(%esp,%edi), %eax
-; FALLBACK28-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    addl %edi, %edi
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; FALLBACK28-NEXT:    shll %cl, %edi
 ; FALLBACK28-NEXT:    orl %ebx, %edi
-; FALLBACK28-NEXT:    movb %dh, %cl
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    addl %esi, %esi
-; FALLBACK28-NEXT:    movl %edx, %ecx
-; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    orl %ebx, %esi
-; FALLBACK28-NEXT:    movb %dh, %cl
-; FALLBACK28-NEXT:    sarl %cl, %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    sarl %cl, %edx
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT:    movl %eax, 28(%ecx)
-; FALLBACK28-NEXT:    movl %esi, 4(%ecx)
-; FALLBACK28-NEXT:    movl %edi, 24(%ecx)
-; FALLBACK28-NEXT:    movl %ebp, 16(%ecx)
+; FALLBACK28-NEXT:    movl %edx, 28(%ecx)
+; FALLBACK28-NEXT:    movl %edi, 4(%ecx)
+; FALLBACK28-NEXT:    movl %esi, 24(%ecx)
+; FALLBACK28-NEXT:    movl %eax, 16(%ecx)
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK28-NEXT:    movl %eax, 20(%ecx)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK28-NEXT:    movl %ebp, 8(%ecx)
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK28-NEXT:    movl %eax, 12(%ecx)
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -10863,31 +10797,27 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    addl %ecx, %ecx
 ; FALLBACK30-NEXT:    shlxl %edx, %ecx, %esi
 ; FALLBACK30-NEXT:    movl 40(%esp,%edi), %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    shrxl %eax, %ecx, %ebx
-; FALLBACK30-NEXT:    movl %eax, %ecx
 ; FALLBACK30-NEXT:    orl %ebx, %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 56(%esp,%edi), %esi
 ; FALLBACK30-NEXT:    leal (%esi,%esi), %ebx
-; FALLBACK30-NEXT:    shlxl %edx, %ebx, %eax
+; FALLBACK30-NEXT:    shlxl %edx, %ebx, %ebx
+; FALLBACK30-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 52(%esp,%edi), %ebx
-; FALLBACK30-NEXT:    shrxl %ecx, %ebx, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl %ecx, %eax
-; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK30-NEXT:    shrxl %eax, %ebx, %ebp
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK30-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    addl %ebx, %ebx
 ; FALLBACK30-NEXT:    shlxl %edx, %ebx, %ebx
 ; FALLBACK30-NEXT:    orl %ebp, %ebx
-; FALLBACK30-NEXT:    shrxl %ecx, %esi, %ecx
+; FALLBACK30-NEXT:    shrxl %eax, %esi, %ebp
 ; FALLBACK30-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; FALLBACK30-NEXT:    movl 60(%esp,%edi), %edi
 ; FALLBACK30-NEXT:    sarxl %eax, %edi, %eax
 ; FALLBACK30-NEXT:    addl %edi, %edi
 ; FALLBACK30-NEXT:    shlxl %edx, %edi, %edi
-; FALLBACK30-NEXT:    orl %ecx, %edi
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK30-NEXT:    orl %ebp, %edi
 ; FALLBACK30-NEXT:    addl %ecx, %ecx
 ; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ecx
 ; FALLBACK30-NEXT:    orl %esi, %ecx
@@ -13188,242 +13118,235 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    pushl %edi
 ; FALLBACK16-NEXT:    pushl %esi
 ; FALLBACK16-NEXT:    subl $204, %esp
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl (%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 4(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 8(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 12(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 16(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 20(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 24(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 28(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 32(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 36(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 40(%eax), %ebp
-; FALLBACK16-NEXT:    movl 44(%eax), %ebx
-; FALLBACK16-NEXT:    movl 48(%eax), %edi
-; FALLBACK16-NEXT:    movl 52(%eax), %esi
-; FALLBACK16-NEXT:    movl 56(%eax), %edx
-; FALLBACK16-NEXT:    movl 60(%eax), %ecx
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl (%eax), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK16-NEXT:    movl (%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 16(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 20(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 24(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 28(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 40(%ebx), %ebp
+; FALLBACK16-NEXT:    movl 44(%ebx), %edi
+; FALLBACK16-NEXT:    movl 48(%ebx), %esi
+; FALLBACK16-NEXT:    movl 52(%ebx), %edx
+; FALLBACK16-NEXT:    movl 56(%ebx), %ecx
+; FALLBACK16-NEXT:    movl 60(%ebx), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK16-NEXT:    movl (%ebx), %ebx
 ; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %eax, %esi
-; FALLBACK16-NEXT:    andl $60, %esi
-; FALLBACK16-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK16-NEXT:    shll $3, %eax
-; FALLBACK16-NEXT:    andl $24, %eax
-; FALLBACK16-NEXT:    movl %edx, %edi
-; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, %ebp
+; FALLBACK16-NEXT:    andl $60, %ebp
+; FALLBACK16-NEXT:    movl 68(%esp,%ebp), %esi
+; FALLBACK16-NEXT:    shll $3, %ebx
+; FALLBACK16-NEXT:    andl $24, %ebx
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 72(%esp,%ebp), %eax
+; FALLBACK16-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK16-NEXT:    movb %bl, %ch
+; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %edx, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 64(%esp,%ebp), %edx
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %edx, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 76(%esp,%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    movl %ebx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK16-NEXT:    movl 80(%esp,%ebp), %edx
+; FALLBACK16-NEXT:    leal (%edx,%edx), %ecx
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK16-NEXT:    movb %al, %ch
-; FALLBACK16-NEXT:    notb %ch
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %edi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 84(%esp,%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    movl %ebx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    movl 88(%esp,%ebp), %eax
+; FALLBACK16-NEXT:    leal (%eax,%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    orl %edi, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK16-NEXT:    movl %edx, %ebp
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK16-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK16-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    addl %esi, %esi
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %ebp, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %edx, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 92(%esp,%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    movl 96(%esp,%ebp), %edx
+; FALLBACK16-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    orl %ebx, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK16-NEXT:    movl %ebx, %ebp
-; FALLBACK16-NEXT:    movl %eax, %edx
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK16-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 100(%esp,%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %eax
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 104(%esp,%ebp), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    addl %esi, %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %edx, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 108(%esp,%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 112(%esp,%ebp), %eax
 ; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    addl %eax, %eax
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %ebp, %eax
+; FALLBACK16-NEXT:    orl %edx, %eax
 ; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movb %bl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    addl %esi, %esi
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %edi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK16-NEXT:    movl %ebx, %ebp
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    movl 96(%esp,%esi), %edi
-; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %ebp, %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %edi, %esi
+; FALLBACK16-NEXT:    movl 116(%esp,%ebp), %edi
+; FALLBACK16-NEXT:    movl %edi, %eax
+; FALLBACK16-NEXT:    movl %ebx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    addl %ebx, %ebx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %eax, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK16-NEXT:    movl %ebx, %ebp
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK16-NEXT:    leal (%edx,%edx), %eax
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %ebp, %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    addl %ebx, %ebx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %edi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 108(%esp,%esi), %edi
-; FALLBACK16-NEXT:    movl %edi, %ebp
-; FALLBACK16-NEXT:    movl %eax, %ecx
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK16-NEXT:    movl 120(%esp,%ebp), %ecx
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %edx
 ; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %ebp, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
 ; FALLBACK16-NEXT:    addl %edi, %edi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %edi
-; FALLBACK16-NEXT:    orl %edx, %edi
-; FALLBACK16-NEXT:    movl %esi, %edx
-; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 116(%esp,%esi), %esi
-; FALLBACK16-NEXT:    movl %esi, %ebx
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    movl 124(%esp,%ebp), %eax
 ; FALLBACK16-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %ebx, %ebp
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    addl %esi, %esi
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    orl %ebx, %esi
-; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK16-NEXT:    movl %ebx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK16-NEXT:    leal (%ebx,%ebx), %edx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    orl %eax, %edx
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK16-NEXT:    movl %edx, 56(%eax)
-; FALLBACK16-NEXT:    movl %esi, 48(%eax)
-; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK16-NEXT:    movl %edi, 40(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, (%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl %eax, 60(%ecx)
+; FALLBACK16-NEXT:    movl %ebp, 56(%ecx)
+; FALLBACK16-NEXT:    movl %edi, 48(%ecx)
+; FALLBACK16-NEXT:    movl %edx, 52(%ecx)
+; FALLBACK16-NEXT:    movl %esi, 40(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, (%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 4(%ecx)
 ; FALLBACK16-NEXT:    addl $204, %esp
 ; FALLBACK16-NEXT:    popl %esi
 ; FALLBACK16-NEXT:    popl %edi
@@ -13657,16 +13580,17 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %eax, %ecx
 ; FALLBACK18-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK18-NEXT:    andl $24, %edx
-; FALLBACK18-NEXT:    andl $60, %ecx
-; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
-; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    andl $60, %eax
+; FALLBACK18-NEXT:    movl 68(%esp,%eax), %esi
+; FALLBACK18-NEXT:    movl 72(%esp,%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %eax, %ecx
 ; FALLBACK18-NEXT:    shrxl %edx, %esi, %edi
 ; FALLBACK18-NEXT:    movl %edx, %ebx
 ; FALLBACK18-NEXT:    notb %bl
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
 ; FALLBACK18-NEXT:    orl %edi, %eax
@@ -13689,61 +13613,59 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    orl %eax, %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 84(%esp,%ecx), %edi
 ; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT:    orl %esi, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %esi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 92(%esp,%ecx), %edi
 ; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK18-NEXT:    shrxl %edx, %eax, %eax
 ; FALLBACK18-NEXT:    addl %edi, %edi
 ; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
 ; FALLBACK18-NEXT:    orl %eax, %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 104(%esp,%ecx), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 100(%esp,%ecx), %edi
 ; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT:    orl %esi, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
-; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK18-NEXT:    movl %ecx, %edi
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %esi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %edi
+; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
 ; FALLBACK18-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    orl %ebp, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %eax, %ecx
 ; FALLBACK18-NEXT:    addl %esi, %esi
 ; FALLBACK18-NEXT:    shlxl %ebx, %esi, %esi
 ; FALLBACK18-NEXT:    orl %ecx, %esi
-; FALLBACK18-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl 120(%esp,%eax), %ebp
 ; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
 ; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT:    movl 116(%esp,%eax), %eax
 ; FALLBACK18-NEXT:    shrxl %edx, %eax, %edi
 ; FALLBACK18-NEXT:    orl %edi, %ecx
 ; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -13963,7 +13885,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
 ; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
 ; FALLBACK20-NEXT:    movups 48(%ecx), %xmm3
-; FALLBACK20-NEXT:    movl (%eax), %eax
+; FALLBACK20-NEXT:    movl (%eax), %ebx
 ; FALLBACK20-NEXT:    xorps %xmm4, %xmm4
 ; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
@@ -13973,160 +13895,155 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    andl $60, %ebp
+; FALLBACK20-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    shll $3, %ebx
+; FALLBACK20-NEXT:    andl $24, %ebx
 ; FALLBACK20-NEXT:    movl %eax, %esi
-; FALLBACK20-NEXT:    andl $60, %esi
-; FALLBACK20-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK20-NEXT:    shll $3, %eax
-; FALLBACK20-NEXT:    andl $24, %eax
-; FALLBACK20-NEXT:    movl %edx, %edi
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    movl 72(%esp,%esi), %ecx
-; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT:    movb %al, %ch
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    movl 72(%esp,%ebp), %edx
+; FALLBACK20-NEXT:    leal (%edx,%edx), %edi
+; FALLBACK20-NEXT:    movb %bl, %ch
 ; FALLBACK20-NEXT:    notb %ch
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %edi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %esi, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 64(%esp,%ebp), %esi
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    addl %eax, %eax
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    orl %edi, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK20-NEXT:    movl %edx, %ebp
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK20-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %esi, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %edi
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    movl 80(%esp,%ebp), %esi
+; FALLBACK20-NEXT:    leal (%esi,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %ebp, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    addl %eax, %eax
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    orl %ebx, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movl %eax, %edx
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %edx, %eax
 ; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %edi
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    movl 88(%esp,%ebp), %edx
+; FALLBACK20-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %esi
 ; FALLBACK20-NEXT:    addl %eax, %eax
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    orl %esi, %eax
 ; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %edi
+; FALLBACK20-NEXT:    movl %ebx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK20-NEXT:    leal (%esi,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %edi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 96(%esp,%esi), %edi
-; FALLBACK20-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK20-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    addl %eax, %eax
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    orl %edx, %eax
 ; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    movl 104(%esp,%ebp), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %eax, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK20-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK20-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    addl %eax, %eax
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    orl %esi, %eax
 ; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    addl %ebx, %ebx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %edi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 108(%esp,%esi), %edi
-; FALLBACK20-NEXT:    movl %edi, %ebp
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %esi
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    movl 112(%esp,%ebp), %ecx
 ; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %edx
 ; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %ebp, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %esi, %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    addl %eax, %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %edi, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 116(%esp,%ebp), %edi
+; FALLBACK20-NEXT:    movl %edi, %eax
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl 120(%esp,%ebp), %edx
+; FALLBACK20-NEXT:    leal (%edx,%edx), %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %eax, %esi
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %eax
 ; FALLBACK20-NEXT:    addl %edi, %edi
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %edi
-; FALLBACK20-NEXT:    orl %edx, %edi
-; FALLBACK20-NEXT:    movl %esi, %edx
-; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 116(%esp,%esi), %esi
-; FALLBACK20-NEXT:    movl %esi, %ebx
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    movl 120(%esp,%edx), %eax
-; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebp
-; FALLBACK20-NEXT:    orl %ebx, %ebp
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    addl %esi, %esi
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    orl %ebx, %esi
-; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    orl %eax, %edi
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    movl %edx, %eax
 ; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK20-NEXT:    leal (%ebp,%ebp), %edx
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %edx
 ; FALLBACK20-NEXT:    orl %eax, %edx
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK20-NEXT:    movl %ebp, 60(%eax)
 ; FALLBACK20-NEXT:    movl %edx, 56(%eax)
-; FALLBACK20-NEXT:    movl %esi, 48(%eax)
-; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK20-NEXT:    movl %edi, 40(%eax)
+; FALLBACK20-NEXT:    movl %edi, 48(%eax)
+; FALLBACK20-NEXT:    movl %esi, 52(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -14284,7 +14201,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
 ; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
 ; FALLBACK22-NEXT:    movups 48(%ecx), %xmm3
-; FALLBACK22-NEXT:    movl (%eax), %ecx
+; FALLBACK22-NEXT:    movl (%eax), %eax
 ; FALLBACK22-NEXT:    xorps %xmm4, %xmm4
 ; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
@@ -14294,88 +14211,84 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    leal (,%ecx,8), %edx
+; FALLBACK22-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK22-NEXT:    andl $24, %edx
-; FALLBACK22-NEXT:    andl $60, %ecx
-; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
-; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    andl $60, %eax
+; FALLBACK22-NEXT:    movl 68(%esp,%eax), %esi
+; FALLBACK22-NEXT:    movl 72(%esp,%eax), %ecx
 ; FALLBACK22-NEXT:    shrxl %edx, %esi, %edi
 ; FALLBACK22-NEXT:    movl %edx, %ebx
 ; FALLBACK22-NEXT:    notb %bl
-; FALLBACK22-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK22-NEXT:    leal (%ecx,%ecx), %ebp
 ; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebp
 ; FALLBACK22-NEXT:    orl %edi, %ebp
 ; FALLBACK22-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%eax), %edi
 ; FALLBACK22-NEXT:    addl %esi, %esi
 ; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
 ; FALLBACK22-NEXT:    orl %edi, %esi
 ; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl 80(%esp,%eax), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 76(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 76(%esp,%eax), %edi
 ; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK22-NEXT:    shrxl %edx, %ecx, %ecx
 ; FALLBACK22-NEXT:    addl %edi, %edi
 ; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
-; FALLBACK22-NEXT:    orl %eax, %edi
+; FALLBACK22-NEXT:    orl %ecx, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    movl 88(%esp,%eax), %ecx
+; FALLBACK22-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 84(%esp,%eax), %edi
 ; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    orl %esi, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %esi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 96(%esp,%eax), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 92(%esp,%eax), %edi
 ; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK22-NEXT:    shrxl %edx, %ecx, %ecx
 ; FALLBACK22-NEXT:    addl %edi, %edi
 ; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
-; FALLBACK22-NEXT:    orl %eax, %edi
+; FALLBACK22-NEXT:    orl %ecx, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    movl 104(%esp,%eax), %ecx
+; FALLBACK22-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 100(%esp,%eax), %edi
 ; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    orl %esi, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl %ecx, %eax
-; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    leal (%ecx,%ecx), %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %ecx
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %esi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 112(%esp,%eax), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %edi
 ; FALLBACK22-NEXT:    movl 108(%esp,%eax), %esi
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    orl %ebp, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %ecx, %ecx
 ; FALLBACK22-NEXT:    addl %esi, %esi
 ; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
 ; FALLBACK22-NEXT:    orl %ecx, %esi
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 120(%esp,%eax), %ebp
 ; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
 ; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
@@ -14553,167 +14466,161 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
 ; FALLBACK24-NEXT:    vmovups 32(%ecx), %ymm1
-; FALLBACK24-NEXT:    movl (%eax), %ecx
+; FALLBACK24-NEXT:    movl (%eax), %ebx
 ; FALLBACK24-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, %esi
-; FALLBACK24-NEXT:    andl $60, %esi
-; FALLBACK24-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK24-NEXT:    shll $3, %ecx
-; FALLBACK24-NEXT:    andl $24, %ecx
-; FALLBACK24-NEXT:    movl %edx, %edi
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    movl 72(%esp,%esi), %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    leal (%eax,%eax), %ebx
-; FALLBACK24-NEXT:    movl %ecx, %ebp
-; FALLBACK24-NEXT:    movb %cl, %ch
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    andl $60, %ebp
+; FALLBACK24-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    shll $3, %ebx
+; FALLBACK24-NEXT:    andl $24, %ebx
+; FALLBACK24-NEXT:    movl %eax, %esi
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    movl 72(%esp,%ebp), %edx
+; FALLBACK24-NEXT:    leal (%edx,%edx), %edi
+; FALLBACK24-NEXT:    movb %bl, %ch
 ; FALLBACK24-NEXT:    notb %ch
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %edi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK24-NEXT:    movl %ebp, %eax
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %esi, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 64(%esp,%ebp), %esi
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    addl %eax, %eax
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    orl %edi, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK24-NEXT:    movl %edx, %ebp
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK24-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %esi, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %edi
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    movl 80(%esp,%ebp), %esi
+; FALLBACK24-NEXT:    leal (%esi,%esi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %ebp, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    addl %eax, %eax
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    orl %ebx, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movl %eax, %edx
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %edx, %eax
 ; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %edi
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    movl 88(%esp,%ebp), %edx
+; FALLBACK24-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %esi
 ; FALLBACK24-NEXT:    addl %eax, %eax
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    orl %esi, %eax
 ; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %edi
+; FALLBACK24-NEXT:    movl %ebx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK24-NEXT:    leal (%esi,%esi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %edi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 96(%esp,%esi), %edi
-; FALLBACK24-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK24-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    addl %eax, %eax
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    orl %edx, %eax
 ; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %edx
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    movl 104(%esp,%ebp), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %eax, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK24-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK24-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    addl %eax, %eax
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    orl %esi, %eax
 ; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    addl %ebx, %ebx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %edi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 108(%esp,%esi), %edi
-; FALLBACK24-NEXT:    movl %edi, %ebp
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %esi
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    movl 112(%esp,%ebp), %ecx
 ; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %edx
 ; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %ebp, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    addl %edi, %edi
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %edi
-; FALLBACK24-NEXT:    orl %edx, %edi
-; FALLBACK24-NEXT:    movl %esi, %edx
-; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 116(%esp,%esi), %esi
-; FALLBACK24-NEXT:    movl %esi, %ebx
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    movl 120(%esp,%edx), %eax
-; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %esi, %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    addl %eax, %eax
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebp
-; FALLBACK24-NEXT:    orl %ebx, %ebp
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %edi, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 116(%esp,%ebp), %edi
+; FALLBACK24-NEXT:    movl %edi, %eax
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 120(%esp,%ebp), %edx
+; FALLBACK24-NEXT:    leal (%edx,%edx), %esi
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    orl %ebx, %esi
-; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    orl %eax, %esi
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT:    addl %edi, %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %eax, %edi
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    movl %edx, %eax
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK24-NEXT:    leal (%ebp,%ebp), %edx
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %edx
 ; FALLBACK24-NEXT:    orl %eax, %edx
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK24-NEXT:    movl %ebp, 60(%eax)
 ; FALLBACK24-NEXT:    movl %edx, 56(%eax)
-; FALLBACK24-NEXT:    movl %esi, 48(%eax)
-; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK24-NEXT:    movl %edi, 40(%eax)
+; FALLBACK24-NEXT:    movl %edi, 48(%eax)
+; FALLBACK24-NEXT:    movl %esi, 52(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -14876,7 +14783,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    andl $60, %ecx
 ; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    shrxl %edx, %esi, %edi
 ; FALLBACK26-NEXT:    movl %edx, %ebx
 ; FALLBACK26-NEXT:    notb %bl
@@ -14891,63 +14797,61 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 80(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 76(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %eax
 ; FALLBACK26-NEXT:    addl %edi, %edi
 ; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
 ; FALLBACK26-NEXT:    orl %eax, %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 88(%esp,%ecx), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 84(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT:    orl %esi, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %esi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 92(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %eax
 ; FALLBACK26-NEXT:    addl %edi, %edi
 ; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
 ; FALLBACK26-NEXT:    orl %eax, %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 104(%esp,%ecx), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 100(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT:    orl %esi, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %esi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %edi
 ; FALLBACK26-NEXT:    movl 108(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    orl %ebp, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %eax
 ; FALLBACK26-NEXT:    addl %esi, %esi
 ; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
 ; FALLBACK26-NEXT:    orl %eax, %esi
@@ -15124,165 +15028,159 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK28-NEXT:    vmovups (%ecx), %zmm0
-; FALLBACK28-NEXT:    movl (%eax), %ecx
+; FALLBACK28-NEXT:    movl (%eax), %ebx
 ; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK28-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, %esi
-; FALLBACK28-NEXT:    andl $60, %esi
-; FALLBACK28-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK28-NEXT:    shll $3, %ecx
-; FALLBACK28-NEXT:    andl $24, %ecx
-; FALLBACK28-NEXT:    movl %edx, %edi
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    movl 72(%esp,%esi), %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    leal (%eax,%eax), %ebx
-; FALLBACK28-NEXT:    movl %ecx, %ebp
-; FALLBACK28-NEXT:    movb %cl, %ch
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    andl $60, %ebp
+; FALLBACK28-NEXT:    movl 68(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    shll $3, %ebx
+; FALLBACK28-NEXT:    andl $24, %ebx
+; FALLBACK28-NEXT:    movl %eax, %esi
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    movl 72(%esp,%ebp), %edx
+; FALLBACK28-NEXT:    leal (%edx,%edx), %edi
+; FALLBACK28-NEXT:    movb %bl, %ch
 ; FALLBACK28-NEXT:    notb %ch
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %edi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK28-NEXT:    movl %ebp, %eax
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %esi, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 64(%esp,%ebp), %esi
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    addl %eax, %eax
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    orl %edi, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK28-NEXT:    movl %edx, %ebp
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK28-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %esi, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 76(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %edi
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    movl 80(%esp,%ebp), %esi
+; FALLBACK28-NEXT:    leal (%esi,%esi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %ebp, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    addl %eax, %eax
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    orl %ebx, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movl %eax, %edx
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 88(%esp,%esi), %eax
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %edx, %eax
 ; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 84(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %edi
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    movl 88(%esp,%ebp), %edx
+; FALLBACK28-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %esi
 ; FALLBACK28-NEXT:    addl %eax, %eax
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    orl %esi, %eax
 ; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl 92(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %edi
+; FALLBACK28-NEXT:    movl %ebx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movl 96(%esp,%ebp), %esi
+; FALLBACK28-NEXT:    leal (%esi,%esi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %edi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 96(%esp,%esi), %edi
-; FALLBACK28-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK28-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    addl %eax, %eax
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    orl %edx, %eax
 ; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movl 100(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %edx
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    movl 104(%esp,%ebp), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %eax, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK28-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK28-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    orl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    addl %eax, %eax
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    orl %esi, %eax
 ; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    addl %ebx, %ebx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %edi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 108(%esp,%esi), %edi
-; FALLBACK28-NEXT:    movl %edi, %ebp
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    movl 108(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %esi
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    movl 112(%esp,%ebp), %ecx
 ; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %edx
 ; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %ebp, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    addl %edi, %edi
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %edi
-; FALLBACK28-NEXT:    orl %edx, %edi
-; FALLBACK28-NEXT:    movl %esi, %edx
-; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 116(%esp,%esi), %esi
-; FALLBACK28-NEXT:    movl %esi, %ebx
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    movl 120(%esp,%edx), %eax
-; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %esi, %edx
+; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    addl %eax, %eax
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebp
-; FALLBACK28-NEXT:    orl %ebx, %ebp
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %edi, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 116(%esp,%ebp), %edi
+; FALLBACK28-NEXT:    movl %edi, %eax
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 120(%esp,%ebp), %edx
+; FALLBACK28-NEXT:    leal (%edx,%edx), %esi
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    orl %ebx, %esi
-; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    orl %eax, %esi
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT:    addl %edi, %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %eax, %edi
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    movl %edx, %eax
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 124(%esp,%ebp), %ebp
+; FALLBACK28-NEXT:    leal (%ebp,%ebp), %edx
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %edx
 ; FALLBACK28-NEXT:    orl %eax, %edx
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT:    movl %ebx, 60(%eax)
+; FALLBACK28-NEXT:    movl %ebp, 60(%eax)
 ; FALLBACK28-NEXT:    movl %edx, 56(%eax)
-; FALLBACK28-NEXT:    movl %esi, 48(%eax)
-; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK28-NEXT:    movl %edi, 40(%eax)
+; FALLBACK28-NEXT:    movl %edi, 48(%eax)
+; FALLBACK28-NEXT:    movl %esi, 52(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -15439,7 +15337,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    andl $60, %edx
 ; FALLBACK30-NEXT:    movl 68(%esp,%edx), %esi
 ; FALLBACK30-NEXT:    movl 72(%esp,%edx), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    shrxl %ecx, %esi, %edi
 ; FALLBACK30-NEXT:    movl %ecx, %ebx
 ; FALLBACK30-NEXT:    notb %bl
@@ -15454,63 +15351,61 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 80(%esp,%edx), %esi
 ; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 76(%esp,%edx), %edi
 ; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK30-NEXT:    shrxl %ecx, %eax, %eax
 ; FALLBACK30-NEXT:    addl %edi, %edi
 ; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
 ; FALLBACK30-NEXT:    orl %eax, %edi
 ; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 88(%esp,%edx), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 84(%esp,%edx), %edi
 ; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK30-NEXT:    shrxl %ecx, %esi, %esi
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT:    orl %esi, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %esi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 96(%esp,%edx), %esi
 ; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 92(%esp,%edx), %edi
 ; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK30-NEXT:    shrxl %ecx, %eax, %eax
 ; FALLBACK30-NEXT:    addl %edi, %edi
 ; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
 ; FALLBACK30-NEXT:    orl %eax, %edi
 ; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 104(%esp,%edx), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 100(%esp,%edx), %edi
 ; FALLBACK30-NEXT:    shrxl %ecx, %edi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK30-NEXT:    shrxl %ecx, %esi, %esi
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT:    orl %esi, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 112(%esp,%edx), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %esi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 112(%esp,%edx), %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %edi
 ; FALLBACK30-NEXT:    movl 108(%esp,%edx), %esi
 ; FALLBACK30-NEXT:    shrxl %ecx, %esi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    orl %ebp, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %ecx, %eax, %eax
 ; FALLBACK30-NEXT:    addl %esi, %esi
 ; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
 ; FALLBACK30-NEXT:    orl %eax, %esi
@@ -17200,254 +17095,255 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    pushl %edi
 ; FALLBACK16-NEXT:    pushl %esi
 ; FALLBACK16-NEXT:    subl $204, %esp
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl (%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 4(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 8(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 12(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 16(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 20(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 24(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 28(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 32(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 36(%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 40(%eax), %ebp
-; FALLBACK16-NEXT:    movl 44(%eax), %ebx
-; FALLBACK16-NEXT:    movl 48(%eax), %edi
-; FALLBACK16-NEXT:    movl 52(%eax), %esi
-; FALLBACK16-NEXT:    movl 56(%eax), %edx
-; FALLBACK16-NEXT:    movl 60(%eax), %ecx
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl (%eax), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK16-NEXT:    movl (%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 16(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 20(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 24(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 28(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%ebx), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 40(%ebx), %ebp
+; FALLBACK16-NEXT:    movl 44(%ebx), %edi
+; FALLBACK16-NEXT:    movl 48(%ebx), %esi
+; FALLBACK16-NEXT:    movl 52(%ebx), %edx
+; FALLBACK16-NEXT:    movl 56(%ebx), %ecx
+; FALLBACK16-NEXT:    movl 60(%ebx), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; FALLBACK16-NEXT:    movl (%ebx), %ebx
 ; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %eax, %edx
-; FALLBACK16-NEXT:    andl $60, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT:    subl %edx, %ecx
-; FALLBACK16-NEXT:    movl (%ecx), %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 4(%ecx), %edx
-; FALLBACK16-NEXT:    movl %ecx, %ebp
-; FALLBACK16-NEXT:    shll $3, %eax
-; FALLBACK16-NEXT:    andl $24, %eax
-; FALLBACK16-NEXT:    movl %edx, %esi
-; FALLBACK16-NEXT:    movl %eax, %ecx
-; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    shrl %edi
-; FALLBACK16-NEXT:    movb %al, %ch
-; FALLBACK16-NEXT:    notb %ch
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    orl %esi, %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 12(%ebp), %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    movl 8(%ebp), %esi
-; FALLBACK16-NEXT:    movl %ebp, %edi
-; FALLBACK16-NEXT:    movl %esi, %ebp
-; FALLBACK16-NEXT:    shrl %ebp
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    orl %ebx, %ebp
-; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    shrl %edx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %esi, %edx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    movl %ebx, %eax
+; FALLBACK16-NEXT:    andl $60, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT:    subl %eax, %ebp
+; FALLBACK16-NEXT:    movl (%ebp), %edx
 ; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %edi, %ebp
-; FALLBACK16-NEXT:    movl 20(%edi), %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    movl 16(%edi), %esi
-; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    movl 4(%ebp), %edi
+; FALLBACK16-NEXT:    shll $3, %ecx
+; FALLBACK16-NEXT:    andl $24, %ecx
+; FALLBACK16-NEXT:    movl %edi, %eax
+; FALLBACK16-NEXT:    shll %cl, %eax
 ; FALLBACK16-NEXT:    shrl %edx
-; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movl %ecx, %ebx
+; FALLBACK16-NEXT:    movl %ecx, %esi
+; FALLBACK16-NEXT:    notb %bl
+; FALLBACK16-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
 ; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movl 12(%ebp), %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %esi, %ecx
+; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl 8(%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, %ebx
+; FALLBACK16-NEXT:    shrl %ebx
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    orl %esi, %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %ebp, %edx
-; FALLBACK16-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    orl %edx, %ebx
 ; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    movl 24(%ebp), %esi
-; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    movl %esi, %ebx
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
 ; FALLBACK16-NEXT:    shrl %edi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    orl %eax, %edi
 ; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl 20(%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
 ; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %ebp
+; FALLBACK16-NEXT:    movl 16(%ebp), %edx
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    shrl %eax
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    orl %esi, %ebp
-; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 36(%edx), %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    movl 32(%edx), %esi
-; FALLBACK16-NEXT:    movl %edx, %ebp
-; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %esi, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK16-NEXT:    shrl %edi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    orl %edx, %edi
 ; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movl 28(%ebp), %edi
+; FALLBACK16-NEXT:    movl %edi, %edx
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl 24(%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, %esi
+; FALLBACK16-NEXT:    shrl %esi
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %esi, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 44(%ebp), %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    movl 40(%ebp), %esi
-; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %edx, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK16-NEXT:    shrl %edx
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
 ; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl 36(%ebp), %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
 ; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    movl 32(%ebp), %edx
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    shrl %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %esi, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %edx, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 44(%ebp), %edx
+; FALLBACK16-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    movl 40(%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, %esi
+; FALLBACK16-NEXT:    shrl %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %edx, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shll %cl, %eax
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK16-NEXT:    shrl %edx
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %esi, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
 ; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 52(%ebp), %esi
-; FALLBACK16-NEXT:    movl %esi, %edi
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl 52(%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, %edi
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    shll %cl, %edi
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK16-NEXT:    negl %edx
-; FALLBACK16-NEXT:    movl 176(%esp,%edx), %ebx
-; FALLBACK16-NEXT:    movl %ebx, %ebp
-; FALLBACK16-NEXT:    shrl %ebp
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    orl %edi, %ebp
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %edx
+; FALLBACK16-NEXT:    movl 176(%esp,%edx), %edx
+; FALLBACK16-NEXT:    movl %edx, %esi
+; FALLBACK16-NEXT:    shrl %esi
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %ebx, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK16-NEXT:    movl 60(%edi), %edx
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %edi, %esi
+; FALLBACK16-NEXT:    movb %bl, %cl
 ; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    movl 56(%edi), %ebx
-; FALLBACK16-NEXT:    movl %ebx, %edi
+; FALLBACK16-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; FALLBACK16-NEXT:    shrl %edi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edi
 ; FALLBACK16-NEXT:    orl %edx, %edi
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 60(%ebp), %ebx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    shrl %esi
+; FALLBACK16-NEXT:    movl 56(%ebp), %edx
+; FALLBACK16-NEXT:    movl %edx, %edi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %esi
-; FALLBACK16-NEXT:    orl %ebx, %esi
-; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    shrl %eax
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %edx, %eax
+; FALLBACK16-NEXT:    movl %ebx, %ecx
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl %edx, (%eax)
-; FALLBACK16-NEXT:    movl %esi, 56(%eax)
-; FALLBACK16-NEXT:    movl %edi, 60(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 48(%eax)
-; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl %edx, (%ecx)
+; FALLBACK16-NEXT:    movl %eax, 56(%ecx)
+; FALLBACK16-NEXT:    movl %edi, 60(%ecx)
+; FALLBACK16-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 48(%ecx)
+; FALLBACK16-NEXT:    movl %esi, 52(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 40(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 4(%ecx)
 ; FALLBACK16-NEXT:    addl $204, %esp
 ; FALLBACK16-NEXT:    popl %esi
 ; FALLBACK16-NEXT:    popl %edi
@@ -17690,7 +17586,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl (%edi), %ecx
 ; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 4(%edi), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl %edx, %ebx
 ; FALLBACK18-NEXT:    notb %bl
 ; FALLBACK18-NEXT:    shrl %ecx
@@ -17701,78 +17596,77 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl 8(%edi), %esi
 ; FALLBACK18-NEXT:    movl %esi, %ecx
 ; FALLBACK18-NEXT:    shrl %ecx
-; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 12(%edi), %ecx
 ; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK18-NEXT:    shrl %eax
 ; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
 ; FALLBACK18-NEXT:    orl %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 16(%edi), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 16(%edi), %esi
+; FALLBACK18-NEXT:    movl %esi, %eax
 ; FALLBACK18-NEXT:    shrl %eax
 ; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT:    movl 20(%edi), %esi
-; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl 20(%edi), %eax
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %ebp
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK18-NEXT:    shrl %ecx
 ; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT:    orl %eax, %ecx
-; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 24(%edi), %ecx
+; FALLBACK18-NEXT:    orl %esi, %ecx
 ; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 24(%edi), %esi
+; FALLBACK18-NEXT:    movl %esi, %ecx
 ; FALLBACK18-NEXT:    shrl %ecx
-; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 28(%edi), %ecx
 ; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT:    shrl %esi
-; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK18-NEXT:    orl %eax, %esi
-; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 32(%edi), %eax
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 32(%edi), %esi
+; FALLBACK18-NEXT:    movl %esi, %eax
 ; FALLBACK18-NEXT:    shrl %eax
 ; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT:    movl 36(%edi), %esi
-; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl 36(%edi), %eax
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %ebp
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK18-NEXT:    shrl %ecx
 ; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT:    orl %eax, %ecx
-; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 40(%edi), %ecx
+; FALLBACK18-NEXT:    orl %esi, %ecx
 ; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 40(%edi), %esi
+; FALLBACK18-NEXT:    movl %esi, %ecx
 ; FALLBACK18-NEXT:    shrl %ecx
-; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 44(%edi), %ecx
 ; FALLBACK18-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK18-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK18-NEXT:    shrl %eax
+; FALLBACK18-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK18-NEXT:    orl %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    movl 48(%edi), %eax
+; FALLBACK18-NEXT:    movl %eax, %esi
 ; FALLBACK18-NEXT:    shrl %esi
 ; FALLBACK18-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK18-NEXT:    orl %eax, %esi
-; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 48(%edi), %esi
 ; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrl %esi
-; FALLBACK18-NEXT:    shrxl %ebx, %esi, %eax
 ; FALLBACK18-NEXT:    movl 52(%edi), %esi
 ; FALLBACK18-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK18-NEXT:    shlxl %edx, %eax, %eax
 ; FALLBACK18-NEXT:    shrl %ecx
 ; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %ebp
 ; FALLBACK18-NEXT:    orl %eax, %ebp
@@ -17999,204 +17893,204 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
 ; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
 ; FALLBACK20-NEXT:    movups 48(%ecx), %xmm3
-; FALLBACK20-NEXT:    movl (%eax), %eax
-; FALLBACK20-NEXT:    xorps %xmm4, %xmm4
-; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %eax, %edx
-; FALLBACK20-NEXT:    andl $60, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT:    subl %edx, %ecx
-; FALLBACK20-NEXT:    movl (%ecx), %edi
-; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 4(%ecx), %edx
-; FALLBACK20-NEXT:    movl %ecx, %ebp
-; FALLBACK20-NEXT:    shll $3, %eax
-; FALLBACK20-NEXT:    andl $24, %eax
-; FALLBACK20-NEXT:    movl %edx, %esi
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    shrl %edi
-; FALLBACK20-NEXT:    movb %al, %ch
-; FALLBACK20-NEXT:    notb %ch
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    orl %esi, %edi
-; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 12(%ebp), %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl 8(%ebp), %esi
-; FALLBACK20-NEXT:    movl %ebp, %edi
-; FALLBACK20-NEXT:    movl %esi, %ebp
-; FALLBACK20-NEXT:    shrl %ebp
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    orl %ebx, %ebp
-; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    shrl %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %esi, %edx
+; FALLBACK20-NEXT:    movl (%eax), %ecx
+; FALLBACK20-NEXT:    xorps %xmm4, %xmm4
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm3, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ecx, %eax
+; FALLBACK20-NEXT:    andl $60, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; FALLBACK20-NEXT:    subl %eax, %ebp
+; FALLBACK20-NEXT:    movl (%ebp), %edx
 ; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl %edi, %ebp
-; FALLBACK20-NEXT:    movl 20(%edi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl 16(%edi), %esi
-; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    movl 4(%ebp), %edi
+; FALLBACK20-NEXT:    shll $3, %ecx
+; FALLBACK20-NEXT:    andl $24, %ecx
+; FALLBACK20-NEXT:    movl %edi, %eax
+; FALLBACK20-NEXT:    shll %cl, %eax
 ; FALLBACK20-NEXT:    shrl %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %ecx, %ebx
+; FALLBACK20-NEXT:    movl %ecx, %esi
+; FALLBACK20-NEXT:    notb %bl
+; FALLBACK20-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    movl %ebx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
 ; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movl 12(%ebp), %edx
+; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %esi, %ecx
+; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl 8(%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %ebx
+; FALLBACK20-NEXT:    shrl %ebx
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    orl %esi, %edi
-; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl %ebp, %edx
-; FALLBACK20-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    orl %edx, %ebx
 ; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl 24(%ebp), %esi
-; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    movl %esi, %ebx
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
 ; FALLBACK20-NEXT:    shrl %edi
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    orl %eax, %edi
 ; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl 20(%ebp), %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
 ; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movl 16(%ebp), %edx
+; FALLBACK20-NEXT:    movl %edx, %eax
+; FALLBACK20-NEXT:    shrl %eax
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    orl %esi, %ebp
-; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 36(%edx), %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl 32(%edx), %esi
-; FALLBACK20-NEXT:    movl %edx, %ebp
-; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %esi, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK20-NEXT:    shrl %edi
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    orl %edx, %edi
 ; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl 28(%ebp), %edi
+; FALLBACK20-NEXT:    movl %edi, %edx
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl 24(%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %esi
+; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %edx, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK20-NEXT:    shrl %edx
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %esi, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
 ; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 44(%ebp), %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl 40(%ebp), %esi
-; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl %esi, %edx
-; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movl 36(%ebp), %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movl 32(%ebp), %edx
+; FALLBACK20-NEXT:    movl %edx, %eax
+; FALLBACK20-NEXT:    shrl %eax
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %esi, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %edx, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 44(%ebp), %edx
 ; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl 40(%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %esi
+; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %edx, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shll %cl, %eax
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK20-NEXT:    shrl %edx
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %esi, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
 ; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 52(%ebp), %esi
-; FALLBACK20-NEXT:    movl %esi, %edi
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl 52(%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, %edi
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK20-NEXT:    shll %cl, %edi
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK20-NEXT:    negl %edx
-; FALLBACK20-NEXT:    movl 176(%esp,%edx), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    shrl %ebp
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    orl %edi, %ebp
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %edx
+; FALLBACK20-NEXT:    movl 176(%esp,%edx), %edx
+; FALLBACK20-NEXT:    movl %edx, %esi
+; FALLBACK20-NEXT:    shrl %esi
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %ebx, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK20-NEXT:    movl 60(%edi), %edx
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %edi, %esi
+; FALLBACK20-NEXT:    movb %bl, %cl
 ; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    movl 56(%edi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %edi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK20-NEXT:    shrl %edi
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %edi
 ; FALLBACK20-NEXT:    orl %edx, %edi
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 60(%ebp), %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movl 56(%ebp), %edx
+; FALLBACK20-NEXT:    movl %edx, %edi
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %esi
-; FALLBACK20-NEXT:    orl %ebx, %esi
-; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %edx, %eax
+; FALLBACK20-NEXT:    movl %ebx, %ecx
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT:    movl %edx, (%eax)
-; FALLBACK20-NEXT:    movl %esi, 56(%eax)
-; FALLBACK20-NEXT:    movl %edi, 60(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 48(%eax)
-; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movl %edx, (%ecx)
+; FALLBACK20-NEXT:    movl %eax, 56(%ecx)
+; FALLBACK20-NEXT:    movl %edi, 60(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 48(%ecx)
+; FALLBACK20-NEXT:    movl %esi, 52(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 40(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 4(%ecx)
 ; FALLBACK20-NEXT:    addl $204, %esp
 ; FALLBACK20-NEXT:    popl %esi
 ; FALLBACK20-NEXT:    popl %edi
@@ -18351,7 +18245,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movl (%edi), %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 4(%edi), %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl %edx, %ebx
 ; FALLBACK22-NEXT:    notb %bl
 ; FALLBACK22-NEXT:    shrl %ecx
@@ -18362,78 +18255,77 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movl 8(%edi), %esi
 ; FALLBACK22-NEXT:    movl %esi, %ecx
 ; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 12(%edi), %ecx
 ; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK22-NEXT:    shrl %eax
 ; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
 ; FALLBACK22-NEXT:    orl %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 16(%edi), %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 16(%edi), %esi
+; FALLBACK22-NEXT:    movl %esi, %eax
 ; FALLBACK22-NEXT:    shrl %eax
 ; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT:    movl 20(%edi), %esi
-; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl 20(%edi), %eax
+; FALLBACK22-NEXT:    shlxl %edx, %eax, %ebp
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK22-NEXT:    shrl %ecx
 ; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT:    orl %eax, %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 24(%edi), %ecx
+; FALLBACK22-NEXT:    orl %esi, %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 24(%edi), %esi
+; FALLBACK22-NEXT:    movl %esi, %ecx
 ; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 28(%edi), %ecx
 ; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT:    shrl %esi
-; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %eax, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 32(%edi), %eax
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 32(%edi), %esi
+; FALLBACK22-NEXT:    movl %esi, %eax
 ; FALLBACK22-NEXT:    shrl %eax
 ; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT:    movl 36(%edi), %esi
-; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl 36(%edi), %eax
+; FALLBACK22-NEXT:    shlxl %edx, %eax, %ebp
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK22-NEXT:    shrl %ecx
 ; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT:    orl %eax, %ecx
-; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 40(%edi), %ecx
+; FALLBACK22-NEXT:    orl %esi, %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 40(%edi), %esi
+; FALLBACK22-NEXT:    movl %esi, %ecx
 ; FALLBACK22-NEXT:    shrl %ecx
-; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 44(%edi), %ecx
 ; FALLBACK22-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK22-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    shrl %eax
+; FALLBACK22-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK22-NEXT:    orl %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl 48(%edi), %eax
+; FALLBACK22-NEXT:    movl %eax, %esi
 ; FALLBACK22-NEXT:    shrl %esi
 ; FALLBACK22-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %eax, %esi
 ; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 48(%edi), %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrl %esi
-; FALLBACK22-NEXT:    shrxl %ebx, %esi, %eax
 ; FALLBACK22-NEXT:    movl 52(%edi), %esi
 ; FALLBACK22-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK22-NEXT:    shlxl %edx, %eax, %eax
 ; FALLBACK22-NEXT:    shrl %ecx
 ; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %ebp
 ; FALLBACK22-NEXT:    orl %eax, %ebp
@@ -18605,209 +18497,209 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ;
 ; FALLBACK24-LABEL: shl_64bytes:
 ; FALLBACK24:       # %bb.0:
-; FALLBACK24-NEXT:    pushl %ebp
-; FALLBACK24-NEXT:    pushl %ebx
-; FALLBACK24-NEXT:    pushl %edi
-; FALLBACK24-NEXT:    pushl %esi
-; FALLBACK24-NEXT:    subl $204, %esp
-; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT:    vmovups 32(%ecx), %ymm1
-; FALLBACK24-NEXT:    movl (%eax), %eax
-; FALLBACK24-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %eax, %edx
-; FALLBACK24-NEXT:    andl $60, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT:    subl %edx, %ecx
-; FALLBACK24-NEXT:    movl (%ecx), %edi
-; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 4(%ecx), %edx
-; FALLBACK24-NEXT:    movl %ecx, %ebp
-; FALLBACK24-NEXT:    shll $3, %eax
-; FALLBACK24-NEXT:    andl $24, %eax
-; FALLBACK24-NEXT:    movl %edx, %esi
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    shrl %edi
-; FALLBACK24-NEXT:    movb %al, %ch
-; FALLBACK24-NEXT:    notb %ch
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    orl %esi, %edi
-; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 12(%ebp), %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl 8(%ebp), %esi
-; FALLBACK24-NEXT:    movl %ebp, %edi
-; FALLBACK24-NEXT:    movl %esi, %ebp
-; FALLBACK24-NEXT:    shrl %ebp
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    orl %ebx, %ebp
-; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    shrl %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %esi, %edx
+; FALLBACK24-NEXT:    pushl %ebp
+; FALLBACK24-NEXT:    pushl %ebx
+; FALLBACK24-NEXT:    pushl %edi
+; FALLBACK24-NEXT:    pushl %esi
+; FALLBACK24-NEXT:    subl $204, %esp
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
+; FALLBACK24-NEXT:    vmovups 32(%ecx), %ymm1
+; FALLBACK24-NEXT:    movl (%eax), %ecx
+; FALLBACK24-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, %eax
+; FALLBACK24-NEXT:    andl $60, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; FALLBACK24-NEXT:    subl %eax, %ebp
+; FALLBACK24-NEXT:    movl (%ebp), %edx
 ; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl %edi, %ebp
-; FALLBACK24-NEXT:    movl 20(%edi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl 16(%edi), %esi
-; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    movl 4(%ebp), %edi
+; FALLBACK24-NEXT:    shll $3, %ecx
+; FALLBACK24-NEXT:    andl $24, %ecx
+; FALLBACK24-NEXT:    movl %edi, %eax
+; FALLBACK24-NEXT:    shll %cl, %eax
 ; FALLBACK24-NEXT:    shrl %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %ecx, %ebx
+; FALLBACK24-NEXT:    movl %ecx, %esi
+; FALLBACK24-NEXT:    notb %bl
+; FALLBACK24-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    movl %ebx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
 ; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movl 12(%ebp), %edx
+; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %esi, %ecx
+; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl 8(%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %ebx
+; FALLBACK24-NEXT:    shrl %ebx
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    orl %esi, %edi
-; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl %ebp, %edx
-; FALLBACK24-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    orl %edx, %ebx
 ; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl 24(%ebp), %esi
-; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    movl %esi, %ebx
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
 ; FALLBACK24-NEXT:    shrl %edi
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    orl %eax, %edi
 ; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl 20(%ebp), %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
 ; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movl 16(%ebp), %edx
+; FALLBACK24-NEXT:    movl %edx, %eax
+; FALLBACK24-NEXT:    shrl %eax
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    orl %esi, %ebp
-; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 36(%edx), %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl 32(%edx), %esi
-; FALLBACK24-NEXT:    movl %edx, %ebp
-; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %esi, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK24-NEXT:    shrl %edi
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    orl %edx, %edi
 ; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl 28(%ebp), %edi
+; FALLBACK24-NEXT:    movl %edi, %edx
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl 24(%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %esi
+; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %edx, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK24-NEXT:    shrl %edx
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %esi, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
 ; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 44(%ebp), %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl 40(%ebp), %esi
-; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl %esi, %edx
-; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movl 36(%ebp), %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movl 32(%ebp), %edx
+; FALLBACK24-NEXT:    movl %edx, %eax
+; FALLBACK24-NEXT:    shrl %eax
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %esi, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %edx, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 44(%ebp), %edx
 ; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl 40(%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %esi
+; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %edx, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %eax
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK24-NEXT:    shrl %edx
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %esi, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
 ; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 52(%ebp), %esi
-; FALLBACK24-NEXT:    movl %esi, %edi
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl 52(%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, %edi
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK24-NEXT:    shll %cl, %edi
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK24-NEXT:    negl %edx
-; FALLBACK24-NEXT:    movl 176(%esp,%edx), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    shrl %ebp
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    orl %edi, %ebp
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %edx
+; FALLBACK24-NEXT:    movl 176(%esp,%edx), %edx
+; FALLBACK24-NEXT:    movl %edx, %esi
+; FALLBACK24-NEXT:    shrl %esi
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %ebx, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK24-NEXT:    movl 60(%edi), %edx
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %edi, %esi
+; FALLBACK24-NEXT:    movb %bl, %cl
 ; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    movl 56(%edi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %edi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK24-NEXT:    shrl %edi
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %edi
 ; FALLBACK24-NEXT:    orl %edx, %edi
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 60(%ebp), %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movl 56(%ebp), %edx
+; FALLBACK24-NEXT:    movl %edx, %edi
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %esi
-; FALLBACK24-NEXT:    orl %ebx, %esi
-; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %edx, %eax
+; FALLBACK24-NEXT:    movl %ebx, %ecx
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT:    movl %edx, (%eax)
-; FALLBACK24-NEXT:    movl %esi, 56(%eax)
-; FALLBACK24-NEXT:    movl %edi, 60(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 48(%eax)
-; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    movl %edx, (%ecx)
+; FALLBACK24-NEXT:    movl %eax, 56(%ecx)
+; FALLBACK24-NEXT:    movl %edi, 60(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 48(%ecx)
+; FALLBACK24-NEXT:    movl %esi, 52(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 40(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 4(%ecx)
 ; FALLBACK24-NEXT:    addl $204, %esp
 ; FALLBACK24-NEXT:    popl %esi
 ; FALLBACK24-NEXT:    popl %edi
@@ -18952,7 +18844,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl (%edi), %ecx
 ; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 4(%edi), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl %edx, %ebx
 ; FALLBACK26-NEXT:    notb %bl
 ; FALLBACK26-NEXT:    shrl %ecx
@@ -18963,78 +18854,77 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl 8(%edi), %esi
 ; FALLBACK26-NEXT:    movl %esi, %ecx
 ; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 12(%edi), %ecx
 ; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK26-NEXT:    shrl %eax
 ; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
 ; FALLBACK26-NEXT:    orl %esi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 16(%edi), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 16(%edi), %esi
+; FALLBACK26-NEXT:    movl %esi, %eax
 ; FALLBACK26-NEXT:    shrl %eax
 ; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT:    movl 20(%edi), %esi
-; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl 20(%edi), %eax
+; FALLBACK26-NEXT:    shlxl %edx, %eax, %ebp
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK26-NEXT:    shrl %ecx
 ; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT:    orl %eax, %ecx
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 24(%edi), %ecx
+; FALLBACK26-NEXT:    orl %esi, %ecx
 ; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 24(%edi), %esi
+; FALLBACK26-NEXT:    movl %esi, %ecx
 ; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 28(%edi), %ecx
 ; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT:    shrl %esi
-; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %eax, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 32(%edi), %eax
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 32(%edi), %esi
+; FALLBACK26-NEXT:    movl %esi, %eax
 ; FALLBACK26-NEXT:    shrl %eax
 ; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT:    movl 36(%edi), %esi
-; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl 36(%edi), %eax
+; FALLBACK26-NEXT:    shlxl %edx, %eax, %ebp
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK26-NEXT:    shrl %ecx
 ; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT:    orl %eax, %ecx
-; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 40(%edi), %ecx
+; FALLBACK26-NEXT:    orl %esi, %ecx
 ; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 40(%edi), %esi
+; FALLBACK26-NEXT:    movl %esi, %ecx
 ; FALLBACK26-NEXT:    shrl %ecx
-; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 44(%edi), %ecx
 ; FALLBACK26-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK26-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK26-NEXT:    shrl %eax
+; FALLBACK26-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK26-NEXT:    orl %esi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    movl 48(%edi), %eax
+; FALLBACK26-NEXT:    movl %eax, %esi
 ; FALLBACK26-NEXT:    shrl %esi
 ; FALLBACK26-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT:    orl %eax, %esi
-; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 48(%edi), %esi
 ; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrl %esi
-; FALLBACK26-NEXT:    shrxl %ebx, %esi, %eax
 ; FALLBACK26-NEXT:    movl 52(%edi), %esi
 ; FALLBACK26-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK26-NEXT:    shlxl %edx, %eax, %eax
 ; FALLBACK26-NEXT:    shrl %ecx
 ; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %ebp
 ; FALLBACK26-NEXT:    orl %eax, %ebp
@@ -19210,198 +19100,198 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK28-NEXT:    vmovups (%ecx), %zmm0
-; FALLBACK28-NEXT:    movl (%eax), %eax
+; FALLBACK28-NEXT:    movl (%eax), %ecx
 ; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK28-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %eax, %edx
-; FALLBACK28-NEXT:    andl $60, %edx
+; FALLBACK28-NEXT:    movl %ecx, %eax
+; FALLBACK28-NEXT:    andl $60, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; FALLBACK28-NEXT:    subl %eax, %ebp
+; FALLBACK28-NEXT:    movl (%ebp), %edx
 ; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT:    subl %edx, %ecx
-; FALLBACK28-NEXT:    movl (%ecx), %edi
-; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 4(%ecx), %edx
-; FALLBACK28-NEXT:    movl %ecx, %ebp
-; FALLBACK28-NEXT:    shll $3, %eax
-; FALLBACK28-NEXT:    andl $24, %eax
-; FALLBACK28-NEXT:    movl %edx, %esi
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    shrl %edi
-; FALLBACK28-NEXT:    movb %al, %ch
-; FALLBACK28-NEXT:    notb %ch
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    orl %esi, %edi
-; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 12(%ebp), %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl 8(%ebp), %esi
-; FALLBACK28-NEXT:    movl %ebp, %edi
-; FALLBACK28-NEXT:    movl %esi, %ebp
-; FALLBACK28-NEXT:    shrl %ebp
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    orl %ebx, %ebp
-; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl 4(%ebp), %edi
+; FALLBACK28-NEXT:    shll $3, %ecx
+; FALLBACK28-NEXT:    andl $24, %ecx
+; FALLBACK28-NEXT:    movl %edi, %eax
+; FALLBACK28-NEXT:    shll %cl, %eax
 ; FALLBACK28-NEXT:    shrl %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %ecx, %ebx
+; FALLBACK28-NEXT:    movl %ecx, %esi
+; FALLBACK28-NEXT:    notb %bl
+; FALLBACK28-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    movl %ebx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %esi, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
 ; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl %edi, %ebp
-; FALLBACK28-NEXT:    movl 20(%edi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl 16(%edi), %esi
-; FALLBACK28-NEXT:    movl %esi, %edx
-; FALLBACK28-NEXT:    shrl %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    movl 12(%ebp), %edx
 ; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movl %esi, %ecx
+; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl 8(%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %ebx
+; FALLBACK28-NEXT:    shrl %ebx
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    orl %esi, %edi
-; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl %ebp, %edx
-; FALLBACK28-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    orl %edx, %ebx
 ; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl 24(%ebp), %esi
-; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    movl %esi, %ebx
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
 ; FALLBACK28-NEXT:    shrl %edi
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    orl %eax, %edi
 ; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl 20(%ebp), %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
 ; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movl 16(%ebp), %edx
+; FALLBACK28-NEXT:    movl %edx, %eax
+; FALLBACK28-NEXT:    shrl %eax
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    orl %esi, %ebp
-; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 36(%edx), %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl 32(%edx), %esi
-; FALLBACK28-NEXT:    movl %edx, %ebp
-; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %esi, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK28-NEXT:    shrl %edi
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    orl %edx, %edi
 ; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl 28(%ebp), %edi
+; FALLBACK28-NEXT:    movl %edi, %edx
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl 24(%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %esi
+; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %edx, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK28-NEXT:    shrl %edx
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %esi, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
 ; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 44(%ebp), %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl 40(%ebp), %esi
-; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl %esi, %edx
-; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movl 36(%ebp), %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movl 32(%ebp), %edx
+; FALLBACK28-NEXT:    movl %edx, %eax
+; FALLBACK28-NEXT:    shrl %eax
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %esi, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %edx, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 44(%ebp), %edx
 ; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl 40(%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %esi
+; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %edx, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %eax
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK28-NEXT:    shrl %edx
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %esi, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
 ; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 52(%ebp), %esi
-; FALLBACK28-NEXT:    movl %esi, %edi
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl 52(%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, %edi
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK28-NEXT:    shll %cl, %edi
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK28-NEXT:    negl %edx
-; FALLBACK28-NEXT:    movl 176(%esp,%edx), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    shrl %ebp
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    orl %edi, %ebp
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %edx
+; FALLBACK28-NEXT:    movl 176(%esp,%edx), %edx
+; FALLBACK28-NEXT:    movl %edx, %esi
+; FALLBACK28-NEXT:    shrl %esi
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %ebx, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK28-NEXT:    movl 60(%edi), %edx
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %edi, %esi
+; FALLBACK28-NEXT:    movb %bl, %cl
 ; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    movl 56(%edi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %edi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK28-NEXT:    shrl %edi
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %edi
 ; FALLBACK28-NEXT:    orl %edx, %edi
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 60(%ebp), %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movl 56(%ebp), %edx
+; FALLBACK28-NEXT:    movl %edx, %edi
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %esi
-; FALLBACK28-NEXT:    orl %ebx, %esi
-; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %edx, %eax
+; FALLBACK28-NEXT:    movl %ebx, %ecx
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT:    movl %edx, (%eax)
-; FALLBACK28-NEXT:    movl %esi, 56(%eax)
-; FALLBACK28-NEXT:    movl %edi, 60(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 48(%eax)
-; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    movl %edx, (%ecx)
+; FALLBACK28-NEXT:    movl %eax, 56(%ecx)
+; FALLBACK28-NEXT:    movl %edi, 60(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 48(%ecx)
+; FALLBACK28-NEXT:    movl %esi, 52(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 40(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 4(%ecx)
 ; FALLBACK28-NEXT:    addl $204, %esp
 ; FALLBACK28-NEXT:    popl %esi
 ; FALLBACK28-NEXT:    popl %edi
@@ -19540,7 +19430,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl (%edi), %ecx
 ; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 4(%edi), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl %edx, %ebx
 ; FALLBACK30-NEXT:    notb %bl
 ; FALLBACK30-NEXT:    shrl %ecx
@@ -19551,78 +19440,77 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl 8(%edi), %esi
 ; FALLBACK30-NEXT:    movl %esi, %ecx
 ; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 12(%edi), %ecx
 ; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
-; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK30-NEXT:    shrl %eax
 ; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
 ; FALLBACK30-NEXT:    orl %esi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 16(%edi), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 16(%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, %eax
 ; FALLBACK30-NEXT:    shrl %eax
 ; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT:    movl 20(%edi), %esi
-; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl 20(%edi), %eax
+; FALLBACK30-NEXT:    shlxl %edx, %eax, %ebp
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK30-NEXT:    shrl %ecx
 ; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT:    orl %eax, %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 24(%edi), %ecx
+; FALLBACK30-NEXT:    orl %esi, %ecx
 ; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 24(%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, %ecx
 ; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 28(%edi), %ecx
 ; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT:    shrl %esi
-; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %eax, %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 32(%edi), %eax
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 32(%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, %eax
 ; FALLBACK30-NEXT:    shrl %eax
 ; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT:    movl 36(%edi), %esi
-; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl 36(%edi), %eax
+; FALLBACK30-NEXT:    shlxl %edx, %eax, %ebp
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
 ; FALLBACK30-NEXT:    shrl %ecx
 ; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT:    orl %eax, %ecx
-; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 40(%edi), %ecx
+; FALLBACK30-NEXT:    orl %esi, %ecx
 ; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 40(%edi), %esi
+; FALLBACK30-NEXT:    movl %esi, %ecx
 ; FALLBACK30-NEXT:    shrl %ecx
-; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 44(%edi), %ecx
 ; FALLBACK30-NEXT:    shlxl %edx, %ecx, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK30-NEXT:    shlxl %edx, %esi, %esi
+; FALLBACK30-NEXT:    shrl %eax
+; FALLBACK30-NEXT:    shrxl %ebx, %eax, %eax
+; FALLBACK30-NEXT:    orl %esi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    movl 48(%edi), %eax
+; FALLBACK30-NEXT:    movl %eax, %esi
 ; FALLBACK30-NEXT:    shrl %esi
 ; FALLBACK30-NEXT:    shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT:    orl %eax, %esi
 ; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 48(%edi), %esi
-; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrl %esi
-; FALLBACK30-NEXT:    shrxl %ebx, %esi, %eax
 ; FALLBACK30-NEXT:    movl 52(%edi), %esi
 ; FALLBACK30-NEXT:    shlxl %edx, %esi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK30-NEXT:    shlxl %edx, %eax, %eax
 ; FALLBACK30-NEXT:    shrl %ecx
 ; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %ebp
 ; FALLBACK30-NEXT:    orl %eax, %ebp
@@ -21444,262 +21332,253 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    pushl %edi
 ; FALLBACK16-NEXT:    pushl %esi
 ; FALLBACK16-NEXT:    subl $204, %esp
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT:    movl (%ecx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 4(%ecx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 8(%ecx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 12(%ecx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 16(%ecx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 20(%ecx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 24(%ecx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 28(%ecx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 32(%ecx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 36(%ecx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 40(%ecx), %ebx
-; FALLBACK16-NEXT:    movl 44(%ecx), %edi
-; FALLBACK16-NEXT:    movl 48(%ecx), %esi
-; FALLBACK16-NEXT:    movl 52(%ecx), %edx
-; FALLBACK16-NEXT:    movl 56(%ecx), %eax
-; FALLBACK16-NEXT:    movl 60(%ecx), %ecx
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    movl (%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 8(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 12(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 16(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 20(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 24(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 28(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 32(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 36(%eax), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 40(%eax), %ebx
+; FALLBACK16-NEXT:    movl 44(%eax), %edi
+; FALLBACK16-NEXT:    movl 48(%eax), %esi
+; FALLBACK16-NEXT:    movl 52(%eax), %edx
+; FALLBACK16-NEXT:    movl 56(%eax), %ecx
+; FALLBACK16-NEXT:    movl 60(%eax), %eax
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; FALLBACK16-NEXT:    movl (%ebp), %ebp
-; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    sarl $31, %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ebp, %ecx
-; FALLBACK16-NEXT:    movl %ebp, %esi
-; FALLBACK16-NEXT:    andl $60, %esi
-; FALLBACK16-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK16-NEXT:    shll $3, %ecx
-; FALLBACK16-NEXT:    andl $24, %ecx
-; FALLBACK16-NEXT:    movl %edx, %eax
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl 72(%esp,%esi), %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    addl %edi, %edi
-; FALLBACK16-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT:    movl %ecx, %ebx
-; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    sarl $31, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, %eax
+; FALLBACK16-NEXT:    movl %ebp, %ebx
+; FALLBACK16-NEXT:    andl $60, %ebx
+; FALLBACK16-NEXT:    movl 68(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    shll $3, %eax
+; FALLBACK16-NEXT:    andl $24, %eax
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 72(%esp,%ebx), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK16-NEXT:    movb %al, %ch
 ; FALLBACK16-NEXT:    notb %ch
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; FALLBACK16-NEXT:    shll %cl, %edi
-; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    orl %edx, %edi
 ; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 64(%esp,%esi), %eax
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    addl %edx, %edx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    orl %eax, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 76(%esp,%esi), %ebp
-; FALLBACK16-NEXT:    movl %ebp, %edx
-; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl 64(%esp,%ebx), %edx
+; FALLBACK16-NEXT:    movb %al, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %edx, %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    addl %esi, %esi
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %eax, %ebp
-; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %esi, %edx
-; FALLBACK16-NEXT:    movl 84(%esp,%esi), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl 88(%esp,%esi), %esi
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %edx, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 76(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, %edx
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 80(%esp,%ebx), %esi
 ; FALLBACK16-NEXT:    leal (%esi,%esi), %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    orl %edx, %ebp
 ; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT:    addl %ebx, %ebx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %edi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %edx, %eax
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 92(%esp,%edx), %ebp
-; FALLBACK16-NEXT:    movl %ebp, %edx
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    movl 96(%esp,%eax), %edi
-; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK16-NEXT:    addl %edi, %edi
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %edx, %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %esi
-; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %edx, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 84(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, %edx
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 88(%esp,%ebx), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %esi, %ebp
+; FALLBACK16-NEXT:    orl %edx, %ebp
 ; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    movl 100(%esp,%edx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl 104(%esp,%edx), %esi
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %esi, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 92(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, %edx
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 96(%esp,%ebx), %esi
 ; FALLBACK16-NEXT:    leal (%esi,%esi), %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    orl %edx, %ebp
 ; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %ebx, %edx
+; FALLBACK16-NEXT:    movl %eax, %edx
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT:    addl %ebx, %ebx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %edi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK16-NEXT:    movl 108(%esp,%ebp), %edi
-; FALLBACK16-NEXT:    movl %edi, %eax
-; FALLBACK16-NEXT:    movl %edx, %ebx
-; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl 112(%esp,%ebp), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %ebp, %edx
-; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebp
-; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %eax, %ebp
-; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %esi
 ; FALLBACK16-NEXT:    addl %edi, %edi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %edi
-; FALLBACK16-NEXT:    orl %esi, %edi
-; FALLBACK16-NEXT:    movl 116(%esp,%edx), %esi
-; FALLBACK16-NEXT:    movl %esi, %eax
-; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 100(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl 120(%esp,%edx), %edx
-; FALLBACK16-NEXT:    leal (%edx,%edx), %ebp
+; FALLBACK16-NEXT:    movl 104(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %edx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    addl %ebp, %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %esi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 108(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    movl %esi, %edx
+; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 112(%esp,%ebx), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebp
 ; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %eax, %ebp
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %edx, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %eax, %edx
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edi
 ; FALLBACK16-NEXT:    addl %esi, %esi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    orl %eax, %esi
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    orl %edi, %esi
+; FALLBACK16-NEXT:    movl 116(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK16-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT:    movl 120(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    leal (%ebp,%ebp), %edx
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %edx
 ; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %ebp
+; FALLBACK16-NEXT:    movl 124(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    leal (%eax,%eax), %ebx
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    orl %ebp, %ebx
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK16-NEXT:    sarl %cl, %ebx
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK16-NEXT:    movl %edx, 56(%eax)
-; FALLBACK16-NEXT:    movl %esi, 48(%eax)
-; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK16-NEXT:    movl %edi, 40(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, (%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    sarl %cl, %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl %eax, 60(%ecx)
+; FALLBACK16-NEXT:    movl %ebx, 56(%ecx)
+; FALLBACK16-NEXT:    movl %edi, 48(%ecx)
+; FALLBACK16-NEXT:    movl %edx, 52(%ecx)
+; FALLBACK16-NEXT:    movl %esi, 40(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, (%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 4(%ecx)
 ; FALLBACK16-NEXT:    addl $204, %esp
 ; FALLBACK16-NEXT:    popl %esi
 ; FALLBACK16-NEXT:    popl %edi
@@ -21957,20 +21836,20 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT:    movl %eax, %ecx
 ; FALLBACK18-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK18-NEXT:    andl $24, %edx
-; FALLBACK18-NEXT:    andl $60, %ecx
-; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
-; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %edi
-; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, %esi, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    andl $60, %eax
+; FALLBACK18-NEXT:    movl 68(%esp,%eax), %esi
+; FALLBACK18-NEXT:    movl 72(%esp,%eax), %ecx
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %eax, %ecx
+; FALLBACK18-NEXT:    shrxl %edx, %esi, %edi
 ; FALLBACK18-NEXT:    movl %edx, %ebx
 ; FALLBACK18-NEXT:    notb %bl
-; FALLBACK18-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
-; FALLBACK18-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    orl %edi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
 ; FALLBACK18-NEXT:    addl %esi, %esi
@@ -21990,61 +21869,59 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    orl %eax, %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 88(%esp,%ecx), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 84(%esp,%ecx), %edi
 ; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT:    orl %esi, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %esi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK18-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 92(%esp,%ecx), %edi
 ; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK18-NEXT:    shrxl %edx, %eax, %eax
 ; FALLBACK18-NEXT:    addl %edi, %edi
 ; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
 ; FALLBACK18-NEXT:    orl %eax, %edi
 ; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 104(%esp,%ecx), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl 100(%esp,%ecx), %edi
 ; FALLBACK18-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK18-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK18-NEXT:    addl %edi, %edi
-; FALLBACK18-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT:    orl %esi, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK18-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK18-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK18-NEXT:    orl %esi, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    movl 112(%esp,%ecx), %esi
+; FALLBACK18-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    addl %esi, %esi
+; FALLBACK18-NEXT:    shlxl %ebx, %esi, %edi
 ; FALLBACK18-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK18-NEXT:    movl %ecx, %edi
-; FALLBACK18-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK18-NEXT:    orl %ebp, %eax
-; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK18-NEXT:    orl %ebp, %edi
+; FALLBACK18-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK18-NEXT:    shrxl %edx, %eax, %ecx
 ; FALLBACK18-NEXT:    addl %esi, %esi
 ; FALLBACK18-NEXT:    shlxl %ebx, %esi, %esi
 ; FALLBACK18-NEXT:    orl %ecx, %esi
-; FALLBACK18-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK18-NEXT:    movl 120(%esp,%eax), %ebp
 ; FALLBACK18-NEXT:    leal (%ebp,%ebp), %ecx
 ; FALLBACK18-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK18-NEXT:    movl 116(%esp,%eax), %eax
 ; FALLBACK18-NEXT:    shrxl %edx, %eax, %edi
 ; FALLBACK18-NEXT:    orl %edi, %ecx
 ; FALLBACK18-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -22270,216 +22147,209 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    pushl %edi
 ; FALLBACK20-NEXT:    pushl %esi
 ; FALLBACK20-NEXT:    subl $204, %esp
-; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT:    movups (%ecx), %xmm0
-; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
-; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
-; FALLBACK20-NEXT:    movl 48(%ecx), %edx
-; FALLBACK20-NEXT:    movl 52(%ecx), %esi
-; FALLBACK20-NEXT:    movl 56(%ecx), %edi
-; FALLBACK20-NEXT:    movl 60(%ecx), %ecx
-; FALLBACK20-NEXT:    movl (%eax), %eax
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK20-NEXT:    movups (%eax), %xmm0
+; FALLBACK20-NEXT:    movups 16(%eax), %xmm1
+; FALLBACK20-NEXT:    movups 32(%eax), %xmm2
+; FALLBACK20-NEXT:    movl 48(%eax), %edx
+; FALLBACK20-NEXT:    movl 52(%eax), %esi
+; FALLBACK20-NEXT:    movl 56(%eax), %edi
+; FALLBACK20-NEXT:    movl 60(%eax), %eax
+; FALLBACK20-NEXT:    movl (%ecx), %ebx
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    sarl $31, %ecx
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %eax, %esi
-; FALLBACK20-NEXT:    andl $60, %esi
-; FALLBACK20-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK20-NEXT:    shll $3, %eax
-; FALLBACK20-NEXT:    andl $24, %eax
-; FALLBACK20-NEXT:    movl %edx, %edi
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    movl 72(%esp,%esi), %ecx
-; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT:    movb %al, %ch
+; FALLBACK20-NEXT:    sarl $31, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK20-NEXT:    movl %ebx, %ebp
+; FALLBACK20-NEXT:    andl $60, %ebp
+; FALLBACK20-NEXT:    movl 68(%esp,%ebp), %esi
+; FALLBACK20-NEXT:    shll $3, %ebx
+; FALLBACK20-NEXT:    andl $24, %ebx
+; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    movl 72(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK20-NEXT:    movb %bl, %ch
 ; FALLBACK20-NEXT:    notb %ch
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %edi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %edx, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 64(%esp,%ebp), %edx
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %edx, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 76(%esp,%ebp), %esi
+; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    movl %ebx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movl 80(%esp,%ebp), %edx
+; FALLBACK20-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    orl %edi, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK20-NEXT:    movl %edx, %ebp
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK20-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %esi, %esi
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %ebp, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %eax, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 84(%esp,%ebp), %esi
+; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    movl 88(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    leal (%eax,%eax), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    orl %ebx, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movl %eax, %edx
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 88(%esp,%esi), %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    addl %eax, %eax
+; FALLBACK20-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    addl %esi, %esi
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %edx, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 92(%esp,%ebp), %esi
+; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    movl %ebx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movl 96(%esp,%ebp), %edx
+; FALLBACK20-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %edi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 96(%esp,%esi), %edi
-; FALLBACK20-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK20-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %esi, %esi
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %eax, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 100(%esp,%ebp), %esi
+; FALLBACK20-NEXT:    movl %esi, %eax
+; FALLBACK20-NEXT:    movl %ebx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movl 104(%esp,%ebp), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %eax, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK20-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK20-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    addl %esi, %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %edx, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 108(%esp,%ebp), %esi
+; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    movl 112(%esp,%ebp), %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    addl %eax, %eax
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    orl %edx, %eax
 ; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movb %bl, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    addl %esi, %esi
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %edi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 108(%esp,%esi), %edi
-; FALLBACK20-NEXT:    movl %edi, %ebp
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %edi, %esi
+; FALLBACK20-NEXT:    movl 116(%esp,%ebp), %edi
+; FALLBACK20-NEXT:    movl %edi, %eax
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl 120(%esp,%ebp), %ecx
 ; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %edx
 ; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %ebp, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %eax
 ; FALLBACK20-NEXT:    addl %edi, %edi
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %edi
-; FALLBACK20-NEXT:    orl %edx, %edi
-; FALLBACK20-NEXT:    movl %esi, %edx
-; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 116(%esp,%esi), %esi
-; FALLBACK20-NEXT:    movl %esi, %ebx
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK20-NEXT:    orl %eax, %edi
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK20-NEXT:    movl 124(%esp,%ebp), %eax
 ; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %ebp
-; FALLBACK20-NEXT:    orl %ebx, %ebp
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    addl %esi, %esi
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    orl %ebx, %esi
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    orl %eax, %edx
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK20-NEXT:    sarl %cl, %ebx
-; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK20-NEXT:    movl %edx, 56(%eax)
-; FALLBACK20-NEXT:    movl %esi, 48(%eax)
-; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK20-NEXT:    movl %edi, 40(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, (%eax)
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK20-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK20-NEXT:    movl %ebx, %ecx
+; FALLBACK20-NEXT:    sarl %cl, %eax
+; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK20-NEXT:    movl %eax, 60(%ecx)
+; FALLBACK20-NEXT:    movl %ebp, 56(%ecx)
+; FALLBACK20-NEXT:    movl %edi, 48(%ecx)
+; FALLBACK20-NEXT:    movl %edx, 52(%ecx)
+; FALLBACK20-NEXT:    movl %esi, 40(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, (%ecx)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, 4(%ecx)
 ; FALLBACK20-NEXT:    addl $204, %esp
 ; FALLBACK20-NEXT:    popl %esi
 ; FALLBACK20-NEXT:    popl %edi
@@ -22661,20 +22531,20 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    movl %eax, %ecx
 ; FALLBACK22-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK22-NEXT:    andl $24, %edx
-; FALLBACK22-NEXT:    andl $60, %ecx
-; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
-; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    andl $60, %eax
+; FALLBACK22-NEXT:    movl 68(%esp,%eax), %esi
+; FALLBACK22-NEXT:    movl 72(%esp,%eax), %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %eax, %ecx
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %edi
 ; FALLBACK22-NEXT:    movl %edx, %ebx
 ; FALLBACK22-NEXT:    notb %bl
-; FALLBACK22-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %eax
-; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    orl %edi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
 ; FALLBACK22-NEXT:    addl %esi, %esi
@@ -22694,61 +22564,59 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    orl %eax, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
 ; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    orl %esi, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %esi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
 ; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK22-NEXT:    shrxl %edx, %eax, %eax
 ; FALLBACK22-NEXT:    addl %edi, %edi
 ; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
 ; FALLBACK22-NEXT:    orl %eax, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
-; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    orl %esi, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
-; FALLBACK22-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK22-NEXT:    movl %ecx, %edi
+; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK22-NEXT:    orl %esi, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    addl %esi, %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %edi
+; FALLBACK22-NEXT:    movl 108(%esp,%ecx), %esi
 ; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    orl %ebp, %edi
+; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %edx, %eax, %ecx
 ; FALLBACK22-NEXT:    addl %esi, %esi
 ; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
 ; FALLBACK22-NEXT:    orl %ecx, %esi
-; FALLBACK22-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK22-NEXT:    movl 120(%esp,%eax), %ebp
 ; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
 ; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK22-NEXT:    movl 116(%esp,%eax), %eax
 ; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
 ; FALLBACK22-NEXT:    orl %edi, %ecx
 ; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -22936,214 +22804,207 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK24-NEXT:    pushl %edi
 ; FALLBACK24-NEXT:    pushl %esi
 ; FALLBACK24-NEXT:    subl $204, %esp
-; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT:    vmovups 32(%ecx), %xmm1
-; FALLBACK24-NEXT:    movl 48(%ecx), %edx
-; FALLBACK24-NEXT:    movl 52(%ecx), %esi
-; FALLBACK24-NEXT:    movl 56(%ecx), %edi
-; FALLBACK24-NEXT:    movl 60(%ecx), %ecx
-; FALLBACK24-NEXT:    movl (%eax), %eax
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK24-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK24-NEXT:    movl 48(%eax), %edx
+; FALLBACK24-NEXT:    movl 52(%eax), %esi
+; FALLBACK24-NEXT:    movl 56(%eax), %edi
+; FALLBACK24-NEXT:    movl 60(%eax), %eax
+; FALLBACK24-NEXT:    movl (%ecx), %ebx
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    sarl $31, %ecx
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %eax, %esi
-; FALLBACK24-NEXT:    andl $60, %esi
-; FALLBACK24-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK24-NEXT:    shll $3, %eax
-; FALLBACK24-NEXT:    andl $24, %eax
-; FALLBACK24-NEXT:    movl %edx, %edi
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    movl 72(%esp,%esi), %ecx
-; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT:    movb %al, %ch
+; FALLBACK24-NEXT:    sarl $31, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ebx, %ebp
+; FALLBACK24-NEXT:    andl $60, %ebp
+; FALLBACK24-NEXT:    movl 68(%esp,%ebp), %esi
+; FALLBACK24-NEXT:    shll $3, %ebx
+; FALLBACK24-NEXT:    andl $24, %ebx
+; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    movl 72(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK24-NEXT:    movb %bl, %ch
 ; FALLBACK24-NEXT:    notb %ch
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %edi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %edx, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 64(%esp,%ebp), %edx
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %edx, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 76(%esp,%ebp), %esi
+; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    movl %ebx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movl 80(%esp,%ebp), %edx
+; FALLBACK24-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    orl %edi, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK24-NEXT:    movl %edx, %ebp
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK24-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %esi, %esi
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %ebp, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %eax, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 84(%esp,%ebp), %esi
+; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    movl 88(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    leal (%eax,%eax), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    orl %ebx, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movl %eax, %edx
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 88(%esp,%esi), %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    addl %eax, %eax
+; FALLBACK24-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    addl %esi, %esi
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %edx, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 92(%esp,%ebp), %esi
+; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    movl %ebx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movl 96(%esp,%ebp), %edx
+; FALLBACK24-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %edi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 96(%esp,%esi), %edi
-; FALLBACK24-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK24-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %esi, %esi
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %eax, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 100(%esp,%ebp), %esi
+; FALLBACK24-NEXT:    movl %esi, %eax
+; FALLBACK24-NEXT:    movl %ebx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movl 104(%esp,%ebp), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %eax, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK24-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK24-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    addl %esi, %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %edx, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 108(%esp,%ebp), %esi
+; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    movl 112(%esp,%ebp), %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    addl %eax, %eax
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    orl %edx, %eax
 ; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movb %bl, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    addl %esi, %esi
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %edi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 108(%esp,%esi), %edi
-; FALLBACK24-NEXT:    movl %edi, %ebp
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %edi, %esi
+; FALLBACK24-NEXT:    movl 116(%esp,%ebp), %edi
+; FALLBACK24-NEXT:    movl %edi, %eax
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 120(%esp,%ebp), %ecx
 ; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %edx
 ; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %ebp, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %eax
 ; FALLBACK24-NEXT:    addl %edi, %edi
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %edi
-; FALLBACK24-NEXT:    orl %edx, %edi
-; FALLBACK24-NEXT:    movl %esi, %edx
-; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 116(%esp,%esi), %esi
-; FALLBACK24-NEXT:    movl %esi, %ebx
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK24-NEXT:    orl %eax, %edi
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK24-NEXT:    movl 124(%esp,%ebp), %eax
 ; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %ebp
-; FALLBACK24-NEXT:    orl %ebx, %ebp
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    addl %esi, %esi
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    orl %ebx, %esi
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    orl %eax, %edx
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK24-NEXT:    sarl %cl, %ebx
-; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK24-NEXT:    movl %edx, 56(%eax)
-; FALLBACK24-NEXT:    movl %esi, 48(%eax)
-; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK24-NEXT:    movl %edi, 40(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, (%eax)
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK24-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK24-NEXT:    movl %ebx, %ecx
+; FALLBACK24-NEXT:    sarl %cl, %eax
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK24-NEXT:    movl %eax, 60(%ecx)
+; FALLBACK24-NEXT:    movl %ebp, 56(%ecx)
+; FALLBACK24-NEXT:    movl %edi, 48(%ecx)
+; FALLBACK24-NEXT:    movl %edx, 52(%ecx)
+; FALLBACK24-NEXT:    movl %esi, 40(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, (%ecx)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, 4(%ecx)
 ; FALLBACK24-NEXT:    addl $204, %esp
 ; FALLBACK24-NEXT:    popl %esi
 ; FALLBACK24-NEXT:    popl %edi
@@ -23323,20 +23184,20 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT:    movl %eax, %ecx
 ; FALLBACK26-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK26-NEXT:    andl $24, %edx
-; FALLBACK26-NEXT:    andl $60, %ecx
-; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
-; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %edi
-; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, %esi, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    andl $60, %eax
+; FALLBACK26-NEXT:    movl 68(%esp,%eax), %esi
+; FALLBACK26-NEXT:    movl 72(%esp,%eax), %ecx
+; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl %eax, %ecx
+; FALLBACK26-NEXT:    shrxl %edx, %esi, %edi
 ; FALLBACK26-NEXT:    movl %edx, %ebx
 ; FALLBACK26-NEXT:    notb %bl
-; FALLBACK26-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %eax
-; FALLBACK26-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    orl %edi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    addl %esi, %esi
@@ -23356,61 +23217,59 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    orl %eax, %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 88(%esp,%ecx), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 84(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT:    orl %esi, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %esi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 92(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %eax
 ; FALLBACK26-NEXT:    addl %edi, %edi
 ; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
 ; FALLBACK26-NEXT:    orl %eax, %edi
 ; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 104(%esp,%ecx), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl 100(%esp,%ecx), %edi
 ; FALLBACK26-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK26-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK26-NEXT:    addl %edi, %edi
-; FALLBACK26-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT:    orl %esi, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK26-NEXT:    shlxl %ebx, %esi, %eax
-; FALLBACK26-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK26-NEXT:    movl %ecx, %edi
+; FALLBACK26-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK26-NEXT:    orl %esi, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    movl 112(%esp,%ecx), %esi
+; FALLBACK26-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    addl %esi, %esi
+; FALLBACK26-NEXT:    shlxl %ebx, %esi, %edi
+; FALLBACK26-NEXT:    movl 108(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK26-NEXT:    orl %ebp, %eax
-; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK26-NEXT:    orl %ebp, %edi
+; FALLBACK26-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK26-NEXT:    shrxl %edx, %eax, %ecx
 ; FALLBACK26-NEXT:    addl %esi, %esi
 ; FALLBACK26-NEXT:    shlxl %ebx, %esi, %esi
 ; FALLBACK26-NEXT:    orl %ecx, %esi
-; FALLBACK26-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK26-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK26-NEXT:    movl 120(%esp,%eax), %ebp
 ; FALLBACK26-NEXT:    leal (%ebp,%ebp), %ecx
 ; FALLBACK26-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK26-NEXT:    movl 116(%esp,%eax), %eax
 ; FALLBACK26-NEXT:    shrxl %edx, %eax, %edi
 ; FALLBACK26-NEXT:    orl %edi, %ecx
 ; FALLBACK26-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
@@ -23598,214 +23457,207 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    pushl %edi
 ; FALLBACK28-NEXT:    pushl %esi
 ; FALLBACK28-NEXT:    subl $204, %esp
-; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK28-NEXT:    vmovups 32(%ecx), %xmm1
-; FALLBACK28-NEXT:    movl 48(%ecx), %edx
-; FALLBACK28-NEXT:    movl 52(%ecx), %esi
-; FALLBACK28-NEXT:    movl 56(%ecx), %edi
-; FALLBACK28-NEXT:    movl 60(%ecx), %ecx
-; FALLBACK28-NEXT:    movl (%eax), %eax
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK28-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK28-NEXT:    movl 48(%eax), %edx
+; FALLBACK28-NEXT:    movl 52(%eax), %esi
+; FALLBACK28-NEXT:    movl 56(%eax), %edi
+; FALLBACK28-NEXT:    movl 60(%eax), %eax
+; FALLBACK28-NEXT:    movl (%ecx), %ebx
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    sarl $31, %ecx
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %eax, %esi
-; FALLBACK28-NEXT:    andl $60, %esi
-; FALLBACK28-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK28-NEXT:    shll $3, %eax
-; FALLBACK28-NEXT:    andl $24, %eax
-; FALLBACK28-NEXT:    movl %edx, %edi
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    movl 72(%esp,%esi), %ecx
-; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT:    movb %al, %ch
+; FALLBACK28-NEXT:    sarl $31, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ebx, %ebp
+; FALLBACK28-NEXT:    andl $60, %ebp
+; FALLBACK28-NEXT:    movl 68(%esp,%ebp), %esi
+; FALLBACK28-NEXT:    shll $3, %ebx
+; FALLBACK28-NEXT:    andl $24, %ebx
+; FALLBACK28-NEXT:    movl %esi, %edx
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    movl 72(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    leal (%eax,%eax), %edi
+; FALLBACK28-NEXT:    movb %bl, %ch
 ; FALLBACK28-NEXT:    notb %ch
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %edi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %edx, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 64(%esp,%ebp), %edx
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %edx, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 76(%esp,%ebp), %esi
+; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    movl %ebx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movl 80(%esp,%ebp), %edx
+; FALLBACK28-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    orl %edi, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK28-NEXT:    movl %edx, %ebp
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK28-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %esi, %esi
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %ebp, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %eax, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 84(%esp,%ebp), %esi
+; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    movl 88(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    leal (%eax,%eax), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    orl %ebx, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movl %eax, %edx
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 88(%esp,%esi), %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    addl %eax, %eax
+; FALLBACK28-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    addl %esi, %esi
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %edx, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 92(%esp,%ebp), %esi
+; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    movl %ebx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movl 96(%esp,%ebp), %edx
+; FALLBACK28-NEXT:    leal (%edx,%edx), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %edi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 96(%esp,%esi), %edi
-; FALLBACK28-NEXT:    leal (%edi,%edi), %eax
+; FALLBACK28-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %esi, %esi
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %eax, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 100(%esp,%ebp), %esi
+; FALLBACK28-NEXT:    movl %esi, %eax
+; FALLBACK28-NEXT:    movl %ebx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movl 104(%esp,%ebp), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %eax, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK28-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK28-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    addl %esi, %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %edx, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 108(%esp,%ebp), %esi
+; FALLBACK28-NEXT:    movl %esi, %edx
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    movl 112(%esp,%ebp), %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    addl %eax, %eax
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    orl %edx, %eax
 ; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movb %bl, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    addl %esi, %esi
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %edi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 108(%esp,%esi), %edi
-; FALLBACK28-NEXT:    movl %edi, %ebp
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %edi, %esi
+; FALLBACK28-NEXT:    movl 116(%esp,%ebp), %edi
+; FALLBACK28-NEXT:    movl %edi, %eax
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 120(%esp,%ebp), %ecx
 ; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %edx
 ; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %ebp, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %eax
 ; FALLBACK28-NEXT:    addl %edi, %edi
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %edi
-; FALLBACK28-NEXT:    orl %edx, %edi
-; FALLBACK28-NEXT:    movl %esi, %edx
-; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 116(%esp,%esi), %esi
-; FALLBACK28-NEXT:    movl %esi, %ebx
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    movl 120(%esp,%edx), %eax
+; FALLBACK28-NEXT:    orl %eax, %edi
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK28-NEXT:    movl 124(%esp,%ebp), %eax
 ; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %ebp
-; FALLBACK28-NEXT:    orl %ebx, %ebp
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    addl %esi, %esi
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    orl %ebx, %esi
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    orl %eax, %edx
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK28-NEXT:    sarl %cl, %ebx
-; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK28-NEXT:    movl %edx, 56(%eax)
-; FALLBACK28-NEXT:    movl %esi, 48(%eax)
-; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK28-NEXT:    movl %edi, 40(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, (%eax)
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK28-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK28-NEXT:    movl %ebx, %ecx
+; FALLBACK28-NEXT:    sarl %cl, %eax
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK28-NEXT:    movl %eax, 60(%ecx)
+; FALLBACK28-NEXT:    movl %ebp, 56(%ecx)
+; FALLBACK28-NEXT:    movl %edi, 48(%ecx)
+; FALLBACK28-NEXT:    movl %edx, 52(%ecx)
+; FALLBACK28-NEXT:    movl %esi, 40(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, (%ecx)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, 4(%ecx)
 ; FALLBACK28-NEXT:    addl $204, %esp
 ; FALLBACK28-NEXT:    popl %esi
 ; FALLBACK28-NEXT:    popl %edi
@@ -23985,20 +23837,20 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT:    movl %eax, %ecx
 ; FALLBACK30-NEXT:    leal (,%eax,8), %edx
 ; FALLBACK30-NEXT:    andl $24, %edx
-; FALLBACK30-NEXT:    andl $60, %ecx
-; FALLBACK30-NEXT:    movl 68(%esp,%ecx), %esi
-; FALLBACK30-NEXT:    movl 72(%esp,%ecx), %edi
-; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, %esi, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    andl $60, %eax
+; FALLBACK30-NEXT:    movl 68(%esp,%eax), %esi
+; FALLBACK30-NEXT:    movl 72(%esp,%eax), %ecx
+; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl %eax, %ecx
+; FALLBACK30-NEXT:    shrxl %edx, %esi, %edi
 ; FALLBACK30-NEXT:    movl %edx, %ebx
 ; FALLBACK30-NEXT:    notb %bl
-; FALLBACK30-NEXT:    leal (%edi,%edi), %ebp
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %eax
-; FALLBACK30-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    orl %edi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
 ; FALLBACK30-NEXT:    addl %esi, %esi
@@ -24018,61 +23870,59 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    orl %eax, %edi
 ; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 88(%esp,%ecx), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 84(%esp,%ecx), %edi
 ; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK30-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT:    orl %esi, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %esi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 96(%esp,%ecx), %esi
 ; FALLBACK30-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 92(%esp,%ecx), %edi
 ; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; FALLBACK30-NEXT:    shrxl %edx, %eax, %eax
 ; FALLBACK30-NEXT:    addl %edi, %edi
 ; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
 ; FALLBACK30-NEXT:    orl %eax, %edi
 ; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 104(%esp,%ecx), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl 100(%esp,%ecx), %edi
 ; FALLBACK30-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; FALLBACK30-NEXT:    shrxl %edx, %esi, %esi
 ; FALLBACK30-NEXT:    addl %edi, %edi
-; FALLBACK30-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT:    orl %esi, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    movl 112(%esp,%ecx), %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    leal (%eax,%eax), %esi
-; FALLBACK30-NEXT:    shlxl %ebx, %esi, %eax
-; FALLBACK30-NEXT:    movl 108(%esp,%ecx), %esi
-; FALLBACK30-NEXT:    movl %ecx, %edi
+; FALLBACK30-NEXT:    shlxl %ebx, %edi, %edi
+; FALLBACK30-NEXT:    orl %esi, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    movl 112(%esp,%ecx), %esi
+; FALLBACK30-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    addl %esi, %esi
+; FALLBACK30-NEXT:    shlxl %ebx, %esi, %edi
+; FALLBACK30-NEXT:    movl 108(%esp,%ecx), %esi
 ; FALLBACK30-NEXT:    shrxl %edx, %esi, %ebp
-; FALLBACK30-NEXT:    orl %ebp, %eax
-; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK30-NEXT:    orl %ebp, %edi
+; FALLBACK30-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK30-NEXT:    shrxl %edx, %eax, %ecx
 ; FALLBACK30-NEXT:    addl %esi, %esi
 ; FALLBACK30-NEXT:    shlxl %ebx, %esi, %esi
 ; FALLBACK30-NEXT:    orl %ecx, %esi
-; FALLBACK30-NEXT:    movl 120(%esp,%edi), %ebp
+; FALLBACK30-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK30-NEXT:    movl 120(%esp,%eax), %ebp
 ; FALLBACK30-NEXT:    leal (%ebp,%ebp), %ecx
 ; FALLBACK30-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT:    movl 116(%esp,%edi), %eax
+; FALLBACK30-NEXT:    movl 116(%esp,%eax), %eax
 ; FALLBACK30-NEXT:    shrxl %edx, %eax, %edi
 ; FALLBACK30-NEXT:    orl %edi, %ecx
 ; FALLBACK30-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 43d2a997c81d21..6e0be0c2dd6f21 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -591,54 +591,55 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%eax), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 12(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -879,55 +880,55 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $60, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    negb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    negb %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %al, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 8(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 12(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%eax)
@@ -1167,57 +1168,58 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ecx), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ecx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ecx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%eax), %dh
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%eax), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, (%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esp,%ebx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dh, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%ebp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%ebp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 8(%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, (%edx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 4(%edx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $44, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -1553,17 +1555,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
@@ -1571,91 +1573,83 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebx,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebx,4), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ebx,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%eax,4), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ebx,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 16(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -2109,17 +2103,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%ecx), %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
@@ -2127,100 +2121,101 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %al
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $28, %al
 ; X86-NO-BMI2-NO-SHLD-NEXT:    negb %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %al, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movsbl %al, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -2234,18 +2229,18 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $108, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ebp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebp), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebp), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebp), %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebp), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebp), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebp), %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
@@ -2253,36 +2248,36 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $3, %al
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andb $28, %al
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    negb %al
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movsbl %al, %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %eax, %edx
@@ -2295,12 +2290,12 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 4(%eax)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $108, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -2348,10 +2343,8 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%esi), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%esi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
@@ -2361,33 +2354,29 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%esi), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebp, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ecx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ebx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, 92(%esp,%esi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %esi, %eax
@@ -2679,115 +2668,108 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $108, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edx), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%ecx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebx), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %al
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %al
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebp,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebp,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %al, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebx,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %ch
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebp,4), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebx,4), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%ebx,4), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx,4), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ebx,4), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ebx,4), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%eax,4), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ebx,4), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 16(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 20(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -2809,28 +2791,27 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $108, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%edx), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%edx), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%edx), %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%edx), %ebx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%edx), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl (%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%edx), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%edx), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%edx), %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %edx
@@ -2842,32 +2823,33 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrb $5, %al
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movzbl %al, %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%esp,%ebp,4), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebp,4), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%esp,%ebp,4), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%ebp,4), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebp,4), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%esp,%ebp,4), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%esp,%ebp,4), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%esp,%ebp,4), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%ebp,4), %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl %cl, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
@@ -2880,7 +2862,7 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, (%ebp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $92, %esp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    addl $108, %esp
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    popl %ebx
@@ -3423,35 +3405,35 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
@@ -3459,7 +3441,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3479,198 +3461,188 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    notl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edx), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 48(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 52(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 40(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 44(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 36(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -3865,148 +3837,141 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ecx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ecx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ecx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ecx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ecx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %ebx
@@ -4015,7 +3980,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -4549,35 +4514,35 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -4585,7 +4550,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4601,7 +4566,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -4609,174 +4574,173 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    negl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    negl %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %edi # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 56(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 48(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 52(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4990,7 +4954,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
@@ -5011,7 +4975,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -5042,7 +5006,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
@@ -5053,92 +5016,87 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 188(%esp,%ecx), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
@@ -5161,7 +5119,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
@@ -5704,9 +5662,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
@@ -5723,19 +5681,19 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -5753,10 +5711,10 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    sarl $31, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -5774,194 +5732,184 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    notl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edx), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 52(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 56(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 48(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 52(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 40(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 44(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 36(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -6168,42 +6116,40 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
@@ -6221,122 +6167,111 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %eax, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index fbbf2a6c127a53..30378c144dbd87 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -2755,48 +2755,46 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%edi), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $156, %esp
@@ -3143,10 +3141,10 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -3156,80 +3154,78 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    notb %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%ebx), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebx), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 28(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 2169b39b9dfa05..87ecf14dc4828e 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -468,25 +468,27 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    testl %esi, %esi
 ; WIN32-NEXT:    setne %dl
 ; WIN32-NEXT:    testl %eax, %eax
 ; WIN32-NEXT:    setne %cl
 ; WIN32-NEXT:    andb %dl, %cl
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    mull %ebx
 ; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %bl, %ch
+; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
 ; WIN32-NEXT:    orb %cl, %ch
 ; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %cl
 ; WIN32-NEXT:    orb %ch, %cl
@@ -494,6 +496,7 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    movl %eax, (%esi)
 ; WIN32-NEXT:    movl %edx, 4(%esi)
 ; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -567,14 +570,14 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
 ; WIN32-NEXT:    pushl %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl %ebp, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
-; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    movl %eax, %ebx
+; WIN32-NEXT:    movl %esi, %edi
+; WIN32-NEXT:    movl %esi, %ebx
 ; WIN32-NEXT:    imull %ecx, %edi
-; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %esi
 ; WIN32-NEXT:    movl %eax, %ecx
@@ -583,7 +586,7 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    sarl $31, %eax
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    imull {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT:    imull %ebp, %edi
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    addl %edi, %ebx
@@ -593,10 +596,11 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    adcl %esi, %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl %edi, %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    mull %ecx
 ; WIN32-NEXT:    movl %edx, %ebp
 ; WIN32-NEXT:    movl %eax, %ecx
 ; WIN32-NEXT:    addl %esi, %ecx
@@ -704,30 +708,29 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT:    testl %ebp, %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    testl %ebx, %ebx
 ; WIN32-NEXT:    setne %al
 ; WIN32-NEXT:    testl %esi, %esi
-; WIN32-NEXT:    setne %bl
-; WIN32-NEXT:    andb %al, %bl
+; WIN32-NEXT:    setne %dl
+; WIN32-NEXT:    andb %al, %dl
+; WIN32-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %edi
-; WIN32-NEXT:    movl %edi, %edx
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %edi
 ; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    movl %edx, %ebp
+; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    mull %ecx
-; WIN32-NEXT:    seto %bh
-; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT:    orb %bl, %bh
+; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
+; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
 ; WIN32-NEXT:    addl %eax, %edi
 ; WIN32-NEXT:    movl %ecx, %eax
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %edi, %edx
 ; WIN32-NEXT:    setb %al
-; WIN32-NEXT:    orb %bh, %al
+; WIN32-NEXT:    orb %bl, %al
 ; WIN32-NEXT:    testb %al, %al
 ; WIN32-NEXT:    jne LBB14_2
 ; WIN32-NEXT:  # %bb.1:
@@ -1305,25 +1308,27 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    testl %esi, %esi
 ; WIN32-NEXT:    setne %dl
 ; WIN32-NEXT:    testl %eax, %eax
 ; WIN32-NEXT:    setne %cl
 ; WIN32-NEXT:    andb %dl, %cl
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebp
+; WIN32-NEXT:    mull %ebx
 ; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %bl, %ch
+; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
 ; WIN32-NEXT:    orb %cl, %ch
 ; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %ebx, %eax
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %al
 ; WIN32-NEXT:    orb %ch, %al
@@ -1332,6 +1337,7 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
 ; WIN32-NEXT:  # %bb.3: # %continue
 ; WIN32-NEXT:    movb $1, %al
 ; WIN32-NEXT:  LBB22_2: # %overflow
+; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -1679,23 +1685,22 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
-; WIN32-NEXT:    subl $20, %esp
+; WIN32-NEXT:    subl $12, %esp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %ebx
-; WIN32-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl 4(%eax), %ebp
+; WIN32-NEXT:    movl (%eax), %ebp
+; WIN32-NEXT:    movl 4(%eax), %esi
 ; WIN32-NEXT:    movl %ecx, %eax
 ; WIN32-NEXT:    movl %ecx, %edi
 ; WIN32-NEXT:    sarl $31, %eax
 ; WIN32-NEXT:    movl %eax, %ecx
-; WIN32-NEXT:    imull %ebp, %ecx
-; WIN32-NEXT:    mull %ebx
+; WIN32-NEXT:    imull %esi, %ecx
+; WIN32-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    addl %ecx, %ebx
-; WIN32-NEXT:    movl %ebp, %ecx
-; WIN32-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %esi, %ecx
 ; WIN32-NEXT:    sarl $31, %ecx
 ; WIN32-NEXT:    movl %edi, %esi
 ; WIN32-NEXT:    imull %ecx, %esi
@@ -1709,19 +1714,18 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    addl %eax, %ecx
 ; WIN32-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; WIN32-NEXT:    adcl %ebx, %edi
-; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    mull %esi
-; WIN32-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %edx, %ecx
 ; WIN32-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT:    movl %ebp, %eax
+; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; WIN32-NEXT:    mull %esi
 ; WIN32-NEXT:    movl %edx, %ebx
 ; WIN32-NEXT:    movl %eax, %esi
-; WIN32-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; WIN32-NEXT:    addl %ecx, %esi
 ; WIN32-NEXT:    adcl $0, %ebx
-; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    movl %ebp, %eax
 ; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
 ; WIN32-NEXT:    movl %edx, %ecx
 ; WIN32-NEXT:    movl %eax, %ebp
@@ -1745,7 +1749,7 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; WIN32-NEXT:    movl %ecx, (%eax)
 ; WIN32-NEXT:    setne %al
-; WIN32-NEXT:    addl $20, %esp
+; WIN32-NEXT:    addl $12, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -2204,33 +2208,36 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT:    movl (%eax), %ebp
+; WIN32-NEXT:    movl (%eax), %ecx
 ; WIN32-NEXT:    movl 4(%eax), %eax
 ; WIN32-NEXT:    testl %esi, %esi
 ; WIN32-NEXT:    setne %dl
 ; WIN32-NEXT:    testl %eax, %eax
-; WIN32-NEXT:    setne %cl
-; WIN32-NEXT:    andb %dl, %cl
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    setne %bl
+; WIN32-NEXT:    andb %dl, %bl
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull %ebp
-; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %bl, %ch
-; WIN32-NEXT:    orb %cl, %ch
+; WIN32-NEXT:    mull %ecx
+; WIN32-NEXT:    seto %bh
+; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
+; WIN32-NEXT:    orb %bl, %bh
 ; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl %ebp, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %cl
-; WIN32-NEXT:    orb %ch, %cl
+; WIN32-NEXT:    orb %bh, %cl
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; WIN32-NEXT:    movl %eax, (%esi)
 ; WIN32-NEXT:    movl %edx, 4(%esi)
 ; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
@@ -2281,6 +2288,8 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    pushl %ebx
 ; WIN32-NEXT:    pushl %edi
 ; WIN32-NEXT:    pushl %esi
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN32-NEXT:    movl (%ecx), %ebp
@@ -2292,14 +2301,14 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    andb %dl, %cl
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    movl %eax, %edi
-; WIN32-NEXT:    seto %bl
+; WIN32-NEXT:    seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; WIN32-NEXT:    movl %esi, %eax
-; WIN32-NEXT:    mull {{[0-9]+}}(%esp)
+; WIN32-NEXT:    mull %ebx
 ; WIN32-NEXT:    seto %ch
-; WIN32-NEXT:    orb %bl, %ch
+; WIN32-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Folded Reload
 ; WIN32-NEXT:    orb %cl, %ch
 ; WIN32-NEXT:    leal (%edi,%eax), %esi
-; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    movl %ebx, %eax
 ; WIN32-NEXT:    mull %ebp
 ; WIN32-NEXT:    addl %esi, %edx
 ; WIN32-NEXT:    setb %cl
@@ -2308,6 +2317,7 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN32-NEXT:    movl %eax, (%esi)
 ; WIN32-NEXT:    movl %edx, 4(%esi)
 ; WIN32-NEXT:    movl %ecx, %eax
+; WIN32-NEXT:    addl $4, %esp
 ; WIN32-NEXT:    popl %esi
 ; WIN32-NEXT:    popl %edi
 ; WIN32-NEXT:    popl %ebx
diff --git a/llvm/test/DebugInfo/COFF/fpo-csrs.ll b/llvm/test/DebugInfo/COFF/fpo-csrs.ll
index c5b1dc996f3ba5..2eec9abff884f9 100644
--- a/llvm/test/DebugInfo/COFF/fpo-csrs.ll
+++ b/llvm/test/DebugInfo/COFF/fpo-csrs.ll
@@ -456,8 +456,8 @@ entry:
 ; ASM:         .cv_fpo_pushreg %edi
 ; ASM:         pushl   %esi
 ; ASM:         .cv_fpo_pushreg %esi
-; ASM:         subl    $8, %esp
-; ASM:         .cv_fpo_stackalloc 8
+; ASM:         pushl	%eax
+; ASM:         .cv_fpo_stackalloc 4
 ; ASM:         .cv_fpo_endprologue
 ; ASM:         retl
 ; ASM:         .cv_fpo_endproc
@@ -467,11 +467,11 @@ entry:
 ; OBJ-NEXT: LinkageName: _spill
 ; OBJ-NEXT: FrameData {
 ; OBJ-NEXT:   RvaStart: 0x0
-; OBJ-NEXT:   CodeSize: 0x5A
+; OBJ-NEXT:   CodeSize: 0x53
 ; OBJ-NEXT:   LocalSize: 0x0
 ; OBJ-NEXT:   ParamsSize: 0x0
 ; OBJ-NEXT:   MaxStackSize: 0x0
-; OBJ-NEXT:   PrologSize: 0x7
+; OBJ-NEXT:   PrologSize: 0x5
 ; OBJ-NEXT:   SavedRegsSize: 0x0
 ; OBJ-NEXT:   Flags [ (0x4)
 ; OBJ-NEXT:     IsFunctionStart (0x4)
@@ -484,11 +484,11 @@ entry:
 ; OBJ-NEXT: }
 ; OBJ-NEXT: FrameData {
 ; OBJ-NEXT:   RvaStart: 0x1
-; OBJ-NEXT:   CodeSize: 0x59
+; OBJ-NEXT:   CodeSize: 0x52
 ; OBJ-NEXT:   LocalSize: 0x0
 ; OBJ-NEXT:   ParamsSize: 0x0
 ; OBJ-NEXT:   MaxStackSize: 0x0
-; OBJ-NEXT:   PrologSize: 0x6
+; OBJ-NEXT:   PrologSize: 0x4
 ; OBJ-NEXT:   SavedRegsSize: 0x4
 ; OBJ-NEXT:   Flags [ (0x0)
 ; OBJ-NEXT:   ]
@@ -501,11 +501,11 @@ entry:
 ; OBJ-NEXT: }
 ; OBJ-NEXT: FrameData {
 ; OBJ-NEXT:   RvaStart: 0x2
-; OBJ-NEXT:   CodeSize: 0x58
+; OBJ-NEXT:   CodeSize: 0x51
 ; OBJ-NEXT:   LocalSize: 0x0
 ; OBJ-NEXT:   ParamsSize: 0x0
 ; OBJ-NEXT:   MaxStackSize: 0x0
-; OBJ-NEXT:   PrologSize: 0x5
+; OBJ-NEXT:   PrologSize: 0x3
 ; OBJ-NEXT:   SavedRegsSize: 0x8
 ; OBJ-NEXT:   Flags [ (0x0)
 ; OBJ-NEXT:   ]
@@ -519,11 +519,11 @@ entry:
 ; OBJ-NEXT: }
 ; OBJ-NEXT: FrameData {
 ; OBJ-NEXT:   RvaStart: 0x3
-; OBJ-NEXT:   CodeSize: 0x57
+; OBJ-NEXT:   CodeSize: 0x50
 ; OBJ-NEXT:   LocalSize: 0x0
 ; OBJ-NEXT:   ParamsSize: 0x0
 ; OBJ-NEXT:   MaxStackSize: 0x0
-; OBJ-NEXT:   PrologSize: 0x4
+; OBJ-NEXT:   PrologSize: 0x2
 ; OBJ-NEXT:   SavedRegsSize: 0xC
 ; OBJ-NEXT:   Flags [ (0x0)
 ; OBJ-NEXT:   ]
@@ -538,11 +538,11 @@ entry:
 ; OBJ-NEXT: }
 ; OBJ-NEXT: FrameData {
 ; OBJ-NEXT:   RvaStart: 0x4
-; OBJ-NEXT:   CodeSize: 0x56
+; OBJ-NEXT:   CodeSize: 0x4F
 ; OBJ-NEXT:   LocalSize: 0x0
 ; OBJ-NEXT:   ParamsSize: 0x0
 ; OBJ-NEXT:   MaxStackSize: 0x0
-; OBJ-NEXT:   PrologSize: 0x3
+; OBJ-NEXT:   PrologSize: 0x1
 ; OBJ-NEXT:   SavedRegsSize: 0x10
 ; OBJ-NEXT:   Flags [ (0x0)
 ; OBJ-NEXT:   ]
@@ -557,9 +557,9 @@ entry:
 ; OBJ-NEXT:   ]
 ; OBJ-NEXT: }
 ; OBJ-NEXT: FrameData {
-; OBJ-NEXT:   RvaStart: 0x7
-; OBJ-NEXT:   CodeSize: 0x53
-; OBJ-NEXT:   LocalSize: 0x8
+; OBJ-NEXT:   RvaStart: 0x5
+; OBJ-NEXT:   CodeSize: 0x4E
+; OBJ-NEXT:   LocalSize: 0x4
 ; OBJ-NEXT:   ParamsSize: 0x0
 ; OBJ-NEXT:   MaxStackSize: 0x0
 ; OBJ-NEXT:   PrologSize: 0x0



More information about the llvm-commits mailing list