[llvm] [TRI] Remove reserved registers in getRegPressureSetLimit (PR #118787)
Pengcheng Wang via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 5 02:59:52 PST 2024
https://github.com/wangpc-pp created https://github.com/llvm/llvm-project/pull/118787
There are two `getRegPressureSetLimit`:
1. `RegisterClassInfo::getRegPressureSetLimit`.
2. `TargetRegisterInfo::getRegPressureSetLimit`.
`RegisterClassInfo::getRegPressureSetLimit` is a wrapper of
`TargetRegisterInfo::getRegPressureSetLimit` with some logic to
adjust the limit by removing reserved registers.
It seems that we shouldn't use `TargetRegisterInfo::getRegPressureSetLimit`
directly, as the comment "This limit must be adjusted
dynamically for reserved registers" says.
However, there exists some passes that use it directly. For example,
`MachineLICM`, `MachineSink`, `MachinePipeliner`, etc. And in these
passes, the register pressure set limits are not adjusted for reserved
registers, which means that the limits are larger than the actual.
These two `getRegPressureSetLimit`s are messy and can easily confuse
users. So here we move the logic of adjusting these limits for
reserved registers from `RegisterClassInfo::getRegPressureSetLimit`
to `TargetRegisterInfo::getRegPressureSetLimit`. This makes the former
a thin cached wrapper of the latter.
This change helps to reduce the number of spills/reloads as well.
Here are the RISC-V's statistics of spills/reloads on llvm-test-suite
with `-O3 -march=rva23u64`:
```
Metric: regalloc.NumSpills,regalloc.NumReloads
Program regalloc.NumSpills regalloc.NumReloads
baseline after diff baseline after diff
External/S...T2017speed/602.gcc_s/602.gcc_s 11811.00 11349.00 -462.00 26812.00 25793.00 -1019.00
External/S...NT2017rate/502.gcc_r/502.gcc_r 11811.00 11349.00 -462.00 26812.00 25793.00 -1019.00
External/S...te/526.blender_r/526.blender_r 13513.00 13251.00 -262.00 27462.00 27195.00 -267.00
SingleSour...nchmarks/Adobe-C++/loop_unroll 1533.00 1413.00 -120.00 2943.00 2633.00 -310.00
External/S...00.perlbench_s/600.perlbench_s 4398.00 4280.00 -118.00 9745.00 9466.00 -279.00
External/S...00.perlbench_r/500.perlbench_r 4398.00 4280.00 -118.00 9745.00 9466.00 -279.00
External/S...rate/510.parest_r/510.parest_r 43985.00 43888.00 -97.00 87407.00 87330.00 -77.00
MultiSourc...sumer-typeset/consumer-typeset 1222.00 1129.00 -93.00 3048.00 2887.00 -161.00
External/S...ed/638.imagick_s/638.imagick_s 4155.00 4064.00 -91.00 10556.00 10463.00 -93.00
External/S...te/538.imagick_r/538.imagick_r 4155.00 4064.00 -91.00 10556.00 10463.00 -93.00
External/S...rate/511.povray_r/511.povray_r 1734.00 1657.00 -77.00 3410.00 3290.00 -120.00
MultiSourc...e/Applications/ClamAV/clamscan 2120.00 2049.00 -71.00 5041.00 4994.00 -47.00
External/S...23.xalancbmk_s/623.xalancbmk_s 1664.00 1608.00 -56.00 2758.00 2663.00 -95.00
External/S...23.xalancbmk_r/523.xalancbmk_r 1664.00 1608.00 -56.00 2758.00 2663.00 -95.00
MultiSource/Applications/SPASS/SPASS 1442.00 1388.00 -54.00 2954.00 2849.00 -105.00
regalloc.NumSpills regalloc.NumReloads
run baseline after diff baseline after diff
mean 86.864054 85.415094 -1.448960 173.354136 170.657475 -2.696661
```
>From 2ec06c7094cff66d31d05e07565cdfa937b03494 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Thu, 5 Dec 2024 18:59:26 +0800
Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?=
=?UTF-8?q?l=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.6-beta.1
---
llvm/include/llvm/CodeGen/RegisterClassInfo.h | 7 +-
.../include/llvm/CodeGen/TargetRegisterInfo.h | 9 +-
llvm/lib/CodeGen/MachinePipeliner.cpp | 41 -
llvm/lib/CodeGen/RegisterClassInfo.cpp | 37 -
llvm/lib/CodeGen/TargetRegisterInfo.cpp | 44 +
llvm/test/CodeGen/LoongArch/jr-without-ra.ll | 112 +--
llvm/test/CodeGen/NVPTX/misched_func_call.ll | 7 +-
llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir | 1 -
llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir | 1 -
.../CodeGen/PowerPC/compute-regpressure.ll | 4 +-
.../RISCV/rvv/vxrm-insert-out-of-loop.ll | 5 +-
.../test/CodeGen/Thumb2/mve-blockplacement.ll | 124 ++-
.../CodeGen/Thumb2/mve-gather-increment.ll | 788 +++++++++---------
.../Thumb2/mve-gather-scatter-optimisation.ll | 140 ++--
llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll | 75 +-
llvm/test/CodeGen/X86/avx512-regcall-Mask.ll | 4 +-
.../test/CodeGen/X86/avx512-regcall-NoMask.ll | 8 +-
llvm/test/CodeGen/X86/sse-regcall.ll | 8 +-
llvm/test/CodeGen/X86/sse-regcall4.ll | 8 +-
.../subvectorwise-store-of-vector-splat.ll | 335 ++++----
...unfold-masked-merge-vector-variablemask.ll | 556 ++++++------
.../CodeGen/X86/x86-64-flags-intrinsics.ll | 16 +-
llvm/test/TableGen/bare-minimum-psets.td | 2 +-
llvm/test/TableGen/inhibit-pset.td | 2 +-
llvm/unittests/CodeGen/MFCommon.inc | 4 +-
llvm/utils/TableGen/RegisterInfoEmitter.cpp | 7 +-
26 files changed, 1155 insertions(+), 1190 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/RegisterClassInfo.h b/llvm/include/llvm/CodeGen/RegisterClassInfo.h
index 800bebea0dddb0..417a1e40d02b95 100644
--- a/llvm/include/llvm/CodeGen/RegisterClassInfo.h
+++ b/llvm/include/llvm/CodeGen/RegisterClassInfo.h
@@ -141,16 +141,11 @@ class RegisterClassInfo {
}
/// Get the register unit limit for the given pressure set index.
- ///
- /// RegisterClassInfo adjusts this limit for reserved registers.
unsigned getRegPressureSetLimit(unsigned Idx) const {
if (!PSetLimits[Idx])
- PSetLimits[Idx] = computePSetLimit(Idx);
+ PSetLimits[Idx] = TRI->getRegPressureSetLimit(*MF, Idx);
return PSetLimits[Idx];
}
-
-protected:
- unsigned computePSetLimit(unsigned Idx) const;
};
} // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 292fa3c94969be..f7cd7cfe1aa15b 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -913,9 +913,14 @@ class TargetRegisterInfo : public MCRegisterInfo {
virtual const char *getRegPressureSetName(unsigned Idx) const = 0;
/// Get the register unit pressure limit for this dimension.
- /// This limit must be adjusted dynamically for reserved registers.
+ /// TargetRegisterInfo adjusts this limit for reserved registers.
virtual unsigned getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx) const = 0;
+ unsigned Idx) const;
+
+ /// Get the raw register unit pressure limit for this dimension.
+ /// This limit must be adjusted dynamically for reserved registers.
+ virtual unsigned getRawRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const = 0;
/// Get the dimensions of register pressure impacted by this register class.
/// Returns a -1 terminated array of pressure set IDs.
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 7a10bd39e2695d..3ee0ba1fea5079 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -1327,47 +1327,6 @@ class HighRegisterPressureDetector {
void computePressureSetLimit(const RegisterClassInfo &RCI) {
for (unsigned PSet = 0; PSet < PSetNum; PSet++)
PressureSetLimit[PSet] = TRI->getRegPressureSetLimit(MF, PSet);
-
- // We assume fixed registers, such as stack pointer, are already in use.
- // Therefore subtracting the weight of the fixed registers from the limit of
- // each pressure set in advance.
- SmallDenseSet<Register, 8> FixedRegs;
- for (const TargetRegisterClass *TRC : TRI->regclasses()) {
- for (const MCPhysReg Reg : *TRC)
- if (isFixedRegister(Reg))
- FixedRegs.insert(Reg);
- }
-
- LLVM_DEBUG({
- for (auto Reg : FixedRegs) {
- dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
- for (MCRegUnit Unit : TRI->regunits(Reg)) {
- const int *Sets = TRI->getRegUnitPressureSets(Unit);
- for (; *Sets != -1; Sets++) {
- dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
- }
- }
- dbgs() << "]\n";
- }
- });
-
- for (auto Reg : FixedRegs) {
- LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
- << "\n");
- for (MCRegUnit Unit : TRI->regunits(Reg)) {
- auto PSetIter = MRI.getPressureSets(Unit);
- unsigned Weight = PSetIter.getWeight();
- for (; PSetIter.isValid(); ++PSetIter) {
- unsigned &Limit = PressureSetLimit[*PSetIter];
- assert(
- Limit >= Weight &&
- "register pressure limit must be greater than or equal weight");
- Limit -= Weight;
- LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
- << " (decreased by " << Weight << ")\n");
- }
- }
- }
}
// There are two patterns of last-use.
diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp
index 9312bc03bc522a..976d41a54da56f 100644
--- a/llvm/lib/CodeGen/RegisterClassInfo.cpp
+++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -195,40 +195,3 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
// RCI is now up-to-date.
RCI.Tag = Tag;
}
-
-/// This is not accurate because two overlapping register sets may have some
-/// nonoverlapping reserved registers. However, computing the allocation order
-/// for all register classes would be too expensive.
-unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
- const TargetRegisterClass *RC = nullptr;
- unsigned NumRCUnits = 0;
- for (const TargetRegisterClass *C : TRI->regclasses()) {
- const int *PSetID = TRI->getRegClassPressureSets(C);
- for (; *PSetID != -1; ++PSetID) {
- if ((unsigned)*PSetID == Idx)
- break;
- }
- if (*PSetID == -1)
- continue;
-
- // Found a register class that counts against this pressure set.
- // For efficiency, only compute the set order for the largest set.
- unsigned NUnits = TRI->getRegClassWeight(C).WeightLimit;
- if (!RC || NUnits > NumRCUnits) {
- RC = C;
- NumRCUnits = NUnits;
- }
- }
- assert(RC && "Failed to find register class");
- compute(RC);
- unsigned NAllocatableRegs = getNumAllocatableRegs(RC);
- unsigned RegPressureSetLimit = TRI->getRegPressureSetLimit(*MF, Idx);
- // If all the regs are reserved, return raw RegPressureSetLimit.
- // One example is VRSAVERC in PowerPC.
- // Avoid returning zero, getRegPressureSetLimit(Idx) assumes computePSetLimit
- // return non-zero value.
- if (NAllocatableRegs == 0)
- return RegPressureSetLimit;
- unsigned NReserved = RC->getNumRegs() - NAllocatableRegs;
- return RegPressureSetLimit - TRI->getRegClassWeight(RC).RegWeight * NReserved;
-}
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index 032f1a33e75c43..4cede283a7232c 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -674,6 +674,50 @@ TargetRegisterInfo::prependOffsetExpression(const DIExpression *Expr,
PrependFlags & DIExpression::EntryValue);
}
+unsigned TargetRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const {
+ const TargetRegisterClass *RC = nullptr;
+ unsigned NumRCUnits = 0;
+ for (const TargetRegisterClass *C : regclasses()) {
+ const int *PSetID = getRegClassPressureSets(C);
+ for (; *PSetID != -1; ++PSetID) {
+ if ((unsigned)*PSetID == Idx)
+ break;
+ }
+ if (*PSetID == -1)
+ continue;
+
+ // Found a register class that counts against this pressure set.
+ // For efficiency, only compute the set order for the largest set.
+ unsigned NUnits = getRegClassWeight(C).WeightLimit;
+ if (!RC || NUnits > NumRCUnits) {
+ RC = C;
+ NumRCUnits = NUnits;
+ }
+ }
+ assert(RC && "Failed to find register class");
+
+ unsigned NReserved = 0;
+ const BitVector Reserved = MF.getRegInfo().getReservedRegs();
+ for (unsigned PhysReg : RC->getRawAllocationOrder(MF))
+ if (Reserved.test(PhysReg))
+ NReserved++;
+
+ unsigned NAllocatableRegs = RC->getNumRegs() - NReserved;
+ unsigned RegPressureSetLimit = getRawRegPressureSetLimit(MF, Idx);
+ // If all the regs are reserved, return raw RegPressureSetLimit.
+ // One example is VRSAVERC in PowerPC.
+ // Avoid returning zero, RegisterClassInfo::getRegPressureSetLimit(Idx)
+ // assumes this returns non-zero value.
+ if (NAllocatableRegs == 0) {
+ LLVM_DEBUG({
+ dbgs() << "All registers of " << getRegClassName(RC) << " are reserved!";
+ });
+ return RegPressureSetLimit;
+ }
+ return RegPressureSetLimit - getRegClassWeight(RC).RegWeight * NReserved;
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex,
diff --git a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll
index d1c4459aaa6ee0..2bd89dacb2b37a 100644
--- a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll
+++ b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll
@@ -20,101 +20,101 @@ define void @jr_without_ra(ptr %rtwdev, ptr %chan, ptr %h2c, i8 %.pre, i1 %cmp.i
; CHECK-NEXT: st.d $s6, $sp, 24 # 8-byte Folded Spill
; CHECK-NEXT: st.d $s7, $sp, 16 # 8-byte Folded Spill
; CHECK-NEXT: st.d $s8, $sp, 8 # 8-byte Folded Spill
-; CHECK-NEXT: move $s7, $zero
-; CHECK-NEXT: move $s0, $zero
+; CHECK-NEXT: move $s6, $zero
+; CHECK-NEXT: move $s1, $zero
; CHECK-NEXT: ld.d $t0, $sp, 184
-; CHECK-NEXT: ld.d $s2, $sp, 176
-; CHECK-NEXT: ld.d $s1, $sp, 168
-; CHECK-NEXT: ld.d $t1, $sp, 160
-; CHECK-NEXT: ld.d $t2, $sp, 152
-; CHECK-NEXT: ld.d $t3, $sp, 144
-; CHECK-NEXT: ld.d $t4, $sp, 136
-; CHECK-NEXT: ld.d $t5, $sp, 128
-; CHECK-NEXT: ld.d $t6, $sp, 120
-; CHECK-NEXT: ld.d $t7, $sp, 112
-; CHECK-NEXT: ld.d $t8, $sp, 104
-; CHECK-NEXT: ld.d $fp, $sp, 96
+; CHECK-NEXT: ld.d $t1, $sp, 176
+; CHECK-NEXT: ld.d $s2, $sp, 168
+; CHECK-NEXT: ld.d $t2, $sp, 160
+; CHECK-NEXT: ld.d $t3, $sp, 152
+; CHECK-NEXT: ld.d $t4, $sp, 144
+; CHECK-NEXT: ld.d $t5, $sp, 136
+; CHECK-NEXT: ld.d $t6, $sp, 128
+; CHECK-NEXT: ld.d $t7, $sp, 120
+; CHECK-NEXT: ld.d $t8, $sp, 112
+; CHECK-NEXT: ld.d $fp, $sp, 104
+; CHECK-NEXT: ld.d $s0, $sp, 96
; CHECK-NEXT: andi $a4, $a4, 1
-; CHECK-NEXT: alsl.d $a6, $a6, $s1, 4
-; CHECK-NEXT: pcalau12i $s1, %pc_hi20(.LJTI0_0)
-; CHECK-NEXT: addi.d $s1, $s1, %pc_lo12(.LJTI0_0)
-; CHECK-NEXT: slli.d $s3, $s2, 2
-; CHECK-NEXT: alsl.d $s2, $s2, $s3, 1
-; CHECK-NEXT: add.d $s2, $t5, $s2
-; CHECK-NEXT: addi.w $s4, $zero, -41
+; CHECK-NEXT: alsl.d $a6, $a6, $s2, 4
+; CHECK-NEXT: pcalau12i $s2, %pc_hi20(.LJTI0_0)
+; CHECK-NEXT: addi.d $s2, $s2, %pc_lo12(.LJTI0_0)
; CHECK-NEXT: ori $s3, $zero, 1
-; CHECK-NEXT: slli.d $s4, $s4, 3
-; CHECK-NEXT: ori $s6, $zero, 3
-; CHECK-NEXT: lu32i.d $s6, 262144
+; CHECK-NEXT: ori $s4, $zero, 50
+; CHECK-NEXT: ori $s5, $zero, 3
+; CHECK-NEXT: lu32i.d $s5, 262144
; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .p2align 4, , 16
; CHECK-NEXT: .LBB0_1: # %sw.bb27.i.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: ori $s8, $zero, 1
+; CHECK-NEXT: ori $s7, $zero, 1
; CHECK-NEXT: .LBB0_2: # %if.else.i106
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: alsl.d $s5, $s0, $s0, 3
-; CHECK-NEXT: alsl.d $s0, $s5, $s0, 1
-; CHECK-NEXT: add.d $s0, $t0, $s0
-; CHECK-NEXT: ldx.bu $s8, $s0, $s8
+; CHECK-NEXT: alsl.d $s8, $s1, $s1, 3
+; CHECK-NEXT: alsl.d $s1, $s8, $s1, 1
+; CHECK-NEXT: add.d $s1, $t0, $s1
+; CHECK-NEXT: ldx.bu $s7, $s1, $s7
; CHECK-NEXT: .LBB0_3: # %phy_tssi_get_ofdm_de.exit
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: st.b $zero, $t5, 0
-; CHECK-NEXT: st.b $s7, $t3, 0
-; CHECK-NEXT: st.b $zero, $t8, 0
-; CHECK-NEXT: st.b $zero, $t1, 0
-; CHECK-NEXT: st.b $zero, $a1, 0
+; CHECK-NEXT: st.b $zero, $t6, 0
+; CHECK-NEXT: st.b $s6, $t4, 0
+; CHECK-NEXT: st.b $zero, $fp, 0
; CHECK-NEXT: st.b $zero, $t2, 0
-; CHECK-NEXT: st.b $s8, $a5, 0
-; CHECK-NEXT: ori $s0, $zero, 1
-; CHECK-NEXT: move $s7, $a3
+; CHECK-NEXT: st.b $zero, $a1, 0
+; CHECK-NEXT: st.b $zero, $t3, 0
+; CHECK-NEXT: st.b $s7, $a5, 0
+; CHECK-NEXT: ori $s1, $zero, 1
+; CHECK-NEXT: move $s6, $a3
; CHECK-NEXT: .LBB0_4: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: beqz $a4, .LBB0_9
; CHECK-NEXT: # %bb.5: # %calc_6g.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: move $s7, $zero
+; CHECK-NEXT: move $s6, $zero
; CHECK-NEXT: bnez $zero, .LBB0_8
; CHECK-NEXT: # %bb.6: # %calc_6g.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: slli.d $s8, $zero, 3
-; CHECK-NEXT: ldx.d $s8, $s8, $s1
-; CHECK-NEXT: jr $s8
+; CHECK-NEXT: slli.d $s7, $zero, 3
+; CHECK-NEXT: ldx.d $s7, $s7, $s2
+; CHECK-NEXT: jr $s7
; CHECK-NEXT: .LBB0_7: # %sw.bb12.i.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: ori $s7, $zero, 1
+; CHECK-NEXT: ori $s6, $zero, 1
; CHECK-NEXT: .LBB0_8: # %if.else58.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: ldx.bu $s7, $a6, $s7
+; CHECK-NEXT: ldx.bu $s6, $a6, $s6
; CHECK-NEXT: b .LBB0_11
; CHECK-NEXT: .p2align 4, , 16
; CHECK-NEXT: .LBB0_9: # %if.end.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: andi $s7, $s7, 255
-; CHECK-NEXT: ori $s5, $zero, 50
-; CHECK-NEXT: bltu $s5, $s7, .LBB0_15
+; CHECK-NEXT: andi $s6, $s6, 255
+; CHECK-NEXT: bltu $s4, $s6, .LBB0_15
; CHECK-NEXT: # %bb.10: # %if.end.i
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: sll.d $s7, $s3, $s7
-; CHECK-NEXT: and $s8, $s7, $s6
-; CHECK-NEXT: move $s7, $fp
-; CHECK-NEXT: beqz $s8, .LBB0_15
+; CHECK-NEXT: sll.d $s6, $s3, $s6
+; CHECK-NEXT: and $s7, $s6, $s5
+; CHECK-NEXT: move $s6, $s0
+; CHECK-NEXT: beqz $s7, .LBB0_15
; CHECK-NEXT: .LBB0_11: # %phy_tssi_get_ofdm_trim_de.exit
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: move $s8, $zero
-; CHECK-NEXT: st.b $zero, $t7, 0
-; CHECK-NEXT: ldx.b $ra, $s2, $t4
+; CHECK-NEXT: move $s7, $zero
+; CHECK-NEXT: st.b $zero, $t8, 0
+; CHECK-NEXT: slli.d $s8, $t1, 2
+; CHECK-NEXT: alsl.d $s8, $t1, $s8, 1
+; CHECK-NEXT: add.d $s8, $t6, $s8
+; CHECK-NEXT: ldx.b $s8, $s8, $t5
; CHECK-NEXT: st.b $zero, $a2, 0
; CHECK-NEXT: st.b $zero, $a7, 0
-; CHECK-NEXT: st.b $zero, $t6, 0
-; CHECK-NEXT: st.b $ra, $a0, 0
+; CHECK-NEXT: st.b $zero, $t7, 0
+; CHECK-NEXT: st.b $s8, $a0, 0
; CHECK-NEXT: bnez $s3, .LBB0_13
; CHECK-NEXT: # %bb.12: # %phy_tssi_get_ofdm_trim_de.exit
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
+; CHECK-NEXT: addi.w $s8, $zero, -41
+; CHECK-NEXT: slli.d $s8, $s8, 3
; CHECK-NEXT: pcalau12i $ra, %pc_hi20(.LJTI0_1)
; CHECK-NEXT: addi.d $ra, $ra, %pc_lo12(.LJTI0_1)
-; CHECK-NEXT: ldx.d $s5, $s4, $ra
-; CHECK-NEXT: jr $s5
+; CHECK-NEXT: ldx.d $s8, $s8, $ra
+; CHECK-NEXT: jr $s8
; CHECK-NEXT: .LBB0_13: # %phy_tssi_get_ofdm_trim_de.exit
; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1
; CHECK-NEXT: bnez $s3, .LBB0_1
diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
index e036753ce90306..ee6b5869111c6f 100644
--- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll
+++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll
@@ -17,7 +17,6 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-NEXT: ld.param.u32 %r2, [wombat_param_0];
; CHECK-NEXT: mov.b32 %r10, 0;
; CHECK-NEXT: mov.u64 %rd1, 0;
-; CHECK-NEXT: mov.b32 %r6, 1;
; CHECK-NEXT: $L__BB0_1: // %bb3
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: { // callseq 0, 0
@@ -29,16 +28,16 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) {
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
+; CHECK-NEXT: ld.param.f64 %fd1, [retval0];
+; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3;
; CHECK-NEXT: or.b32 %r8, %r4, %r7;
; CHECK-NEXT: mul.lo.s32 %r9, %r2, %r8;
; CHECK-NEXT: cvt.rn.f64.s32 %fd3, %r9;
-; CHECK-NEXT: ld.param.f64 %fd1, [retval0];
-; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: cvt.rn.f64.u32 %fd4, %r10;
; CHECK-NEXT: add.rn.f64 %fd5, %fd4, %fd3;
; CHECK-NEXT: st.global.f64 [%rd1], %fd5;
-; CHECK-NEXT: mov.u32 %r10, %r6;
+; CHECK-NEXT: mov.b32 %r10, 1;
; CHECK-NEXT: bra.uni $L__BB0_1;
bb:
br label %bb3
diff --git a/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir b/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir
index fba410dc0dafce..7c8a5848b402f4 100644
--- a/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir
+++ b/llvm/test/CodeGen/PowerPC/aix-csr-alloc.mir
@@ -17,5 +17,4 @@ body: |
...
# CHECK-DAG: AllocationOrder(GPRC) = [ $r3 $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 $r12 $r0 $r31 $r30 $r29 $r28 $r27 $r26 $r25 $r24 $r23 $r22 $r21 $r20 $r19 $r18 $r17 $r16 $r15 $r14 $r13 ]
-# CHECK-DAG: AllocationOrder(F4RC) = [ $f0 $f1 $f2 $f3 $f4 $f5 $f6 $f7 $f8 $f9 $f10 $f11 $f12 $f13 $f31 $f30 $f29 $f28 $f27 $f26 $f25 $f24 $f23 $f22 $f21 $f20 $f19 $f18 $f17 $f16 $f15 $f14 ]
# CHECK-DAG: AllocationOrder(GPRC_and_GPRC_NOR0) = [ $r3 $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 $r12 $r31 $r30 $r29 $r28 $r27 $r26 $r25 $r24 $r23 $r22 $r21 $r20 $r19 $r18 $r17 $r16 $r15 $r14 $r13 ]
diff --git a/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir b/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir
index 584b6b0ad46dd9..3617b95b2a6af7 100644
--- a/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir
+++ b/llvm/test/CodeGen/PowerPC/aix64-csr-alloc.mir
@@ -16,6 +16,5 @@ body: |
$f1 = COPY %2
BLR8 implicit $lr8, implicit undef $rm, implicit $x3, implicit $f1
...
-# CHECK-DAG: AllocationOrder(VFRC) = [ $vf2 $vf3 $vf4 $vf5 $vf0 $vf1 $vf6 $vf7 $vf8 $vf9 $vf10 $vf11 $vf12 $vf13 $vf14 $vf15 $vf16 $vf17 $vf18 $vf19 $vf31 $vf30 $vf29 $vf28 $vf27 $vf26 $vf25 $vf24 $vf23 $vf22 $vf21 $vf20 ]
# CHECK-DAG: AllocationOrder(G8RC_and_G8RC_NOX0) = [ $x3 $x4 $x5 $x6 $x7 $x8 $x9 $x10 $x11 $x12 $x2 $x31 $x30 $x29 $x28 $x27 $x26 $x25 $x24 $x23 $x22 $x21 $x20 $x19 $x18 $x17 $x16 $x15 $x14 ]
# CHECK-DAG: AllocationOrder(F8RC) = [ $f0 $f1 $f2 $f3 $f4 $f5 $f6 $f7 $f8 $f9 $f10 $f11 $f12 $f13 $f31 $f30 $f29 $f28 $f27 $f26 $f25 $f24 $f23 $f22 $f21 $f20 $f19 $f18 $f17 $f16 $f15 $f14 ]
diff --git a/llvm/test/CodeGen/PowerPC/compute-regpressure.ll b/llvm/test/CodeGen/PowerPC/compute-regpressure.ll
index 9a1b057c2e38d4..9d893b8dbebee2 100644
--- a/llvm/test/CodeGen/PowerPC/compute-regpressure.ll
+++ b/llvm/test/CodeGen/PowerPC/compute-regpressure.ll
@@ -1,7 +1,7 @@
; REQUIRES: asserts
-; RUN: llc -debug-only=regalloc < %s 2>&1 |FileCheck %s --check-prefix=DEBUG
+; RUN: llc -debug-only=target-reg-info < %s 2>&1 |FileCheck %s --check-prefix=DEBUG
-; DEBUG-COUNT-1: AllocationOrder(VRSAVERC) = [ ]
+; DEBUG-COUNT-1: All registers of VRSAVERC are reserved!
target triple = "powerpc64le-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
index c35f05be304cce..ec2448cb3965f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll
@@ -489,8 +489,9 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_
; RV64-NEXT: j .LBB0_11
; RV64-NEXT: .LBB0_8: # %vector.ph
; RV64-NEXT: # in Loop: Header=BB0_6 Depth=1
-; RV64-NEXT: slli t6, t0, 28
-; RV64-NEXT: sub t6, t6, t1
+; RV64-NEXT: slli t6, t0, 1
+; RV64-NEXT: slli s0, t0, 28
+; RV64-NEXT: sub t6, s0, t6
; RV64-NEXT: and t6, t6, a6
; RV64-NEXT: csrwi vxrm, 0
; RV64-NEXT: mv s0, a2
diff --git a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
index 7087041e8dace6..6d082802f9cd75 100644
--- a/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-blockplacement.ll
@@ -353,8 +353,8 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: mov lr, r0
@@ -364,50 +364,48 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ %bb.1: @ %for.cond2.preheader.lr.ph
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: csel r7, r2, r0, lt
+; CHECK-NEXT: csel r3, r2, r0, lt
; CHECK-NEXT: mov r12, r1
-; CHECK-NEXT: mov r1, r7
-; CHECK-NEXT: cmp r7, #3
+; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: it ls
; CHECK-NEXT: movls r1, #3
; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: subs r1, r1, r7
+; CHECK-NEXT: subs r1, r1, r3
; CHECK-NEXT: movw r2, #43691
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: movt r2, #43690
-; CHECK-NEXT: ldr r6, [sp, #128]
-; CHECK-NEXT: movw r8, :lower16:c
+; CHECK-NEXT: ldr r6, [sp, #112]
+; CHECK-NEXT: movw r9, :lower16:c
; CHECK-NEXT: umull r1, r2, r1, r2
-; CHECK-NEXT: movt r8, :upper16:c
+; CHECK-NEXT: adr.w r8, .LCPI1_1
; CHECK-NEXT: movs r1, #4
-; CHECK-NEXT: @ implicit-def: $r10
; CHECK-NEXT: @ implicit-def: $r5
; CHECK-NEXT: @ implicit-def: $r11
-; CHECK-NEXT: mov.w r9, #12
-; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: @ implicit-def: $r7
+; CHECK-NEXT: movt r9, :upper16:c
+; CHECK-NEXT: mov.w r10, #12
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r1, r2, lsr #1
; CHECK-NEXT: add.w r0, r0, r2, lsr #1
-; CHECK-NEXT: bic r3, r1, #3
+; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: adr r1, .LCPI1_0
; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: adr r1, .LCPI1_1
-; CHECK-NEXT: vldrw.u32 q5, [r1]
+; CHECK-NEXT: vdup.32 q5, r0
; CHECK-NEXT: vdup.32 q6, r0
-; CHECK-NEXT: vadd.i32 q4, q0, r7
-; CHECK-NEXT: vdup.32 q7, r0
-; CHECK-NEXT: strd r3, r7, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r2, r4, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT: vadd.i32 q4, q0, r3
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_2: @ %for.body6.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: mov r0, r11
-; CHECK-NEXT: cmn.w r11, #4
+; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: cmn.w r7, #4
; CHECK-NEXT: it le
; CHECK-NEXT: mvnle r0, #3
; CHECK-NEXT: movw r2, #18725
; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: movt r2, #9362
-; CHECK-NEXT: sub.w r1, r0, r11
-; CHECK-NEXT: mov r10, r3
+; CHECK-NEXT: subs r1, r0, r7
; CHECK-NEXT: umull r2, r3, r1, r2
; CHECK-NEXT: subs r2, r1, r3
; CHECK-NEXT: add.w r2, r3, r2, lsr #1
@@ -415,19 +413,18 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: lsls r3, r3, #3
; CHECK-NEXT: sub.w r2, r3, r2, lsr #2
; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup5.loopexit134.split.loop.exit139
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r11, r0, #7
+; CHECK-NEXT: adds r7, r0, #7
; CHECK-NEXT: .LBB1_4: @ %for.cond.cleanup5
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup5
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: adds r5, #2
-; CHECK-NEXT: subs.w r1, r5, lr
-; CHECK-NEXT: asr.w r0, r5, #31
+; CHECK-NEXT: add.w r11, r11, #2
+; CHECK-NEXT: subs.w r1, r11, lr
+; CHECK-NEXT: asr.w r0, r11, #31
; CHECK-NEXT: sbcs.w r0, r0, r12
; CHECK-NEXT: bge.w .LBB1_28
; CHECK-NEXT: .LBB1_6: @ %for.cond2.preheader
@@ -436,36 +433,35 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ Child Loop BB1_10 Depth 2
; CHECK-NEXT: @ Child Loop BB1_12 Depth 3
; CHECK-NEXT: @ Child Loop BB1_14 Depth 3
-; CHECK-NEXT: cmp.w r11, #2
+; CHECK-NEXT: cmp r7, #2
; CHECK-NEXT: bgt .LBB1_5
; CHECK-NEXT: @ %bb.7: @ %for.body6.lr.ph
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: cmp r7, #5
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: cmp r0, #5
; CHECK-NEXT: bhi .LBB1_17
; CHECK-NEXT: @ %bb.8: @ %for.body6.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: ldrd r2, r3, [sp, #120]
+; CHECK-NEXT: ldrd r2, r3, [sp, #104]
; CHECK-NEXT: movs r0, #32
; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: mov r4, r6
-; CHECK-NEXT: mov r7, r12
-; CHECK-NEXT: mov r6, lr
+; CHECK-NEXT: mov r6, r12
+; CHECK-NEXT: mov r4, lr
; CHECK-NEXT: bl __aeabi_ldivmod
-; CHECK-NEXT: mov lr, r6
-; CHECK-NEXT: mov r6, r4
-; CHECK-NEXT: mov r12, r7
-; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: mov lr, r4
+; CHECK-NEXT: mov r12, r6
+; CHECK-NEXT: ldr r4, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vdup.32 q0, r2
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: ldr r6, [sp, #112]
+; CHECK-NEXT: mov r0, r7
+; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: b .LBB1_10
; CHECK-NEXT: .LBB1_9: @ %for.cond.cleanup17.us
; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2
-; CHECK-NEXT: add.w r11, r0, #7
+; CHECK-NEXT: adds r7, r0, #7
; CHECK-NEXT: cmn.w r0, #4
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov.w r5, #0
+; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: bge .LBB1_5
; CHECK-NEXT: .LBB1_10: @ %for.body6.us
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
@@ -488,13 +484,14 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vqadd.u32 q2, q5, r1
+; CHECK-NEXT: vldrw.u32 q2, [r8]
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vcmp.u32 hi, q7, q2
-; CHECK-NEXT: vshl.i32 q2, q1, #2
+; CHECK-NEXT: vqadd.u32 q2, q2, r1
; CHECK-NEXT: add.w r1, r1, #4
-; CHECK-NEXT: vadd.i32 q2, q2, r8
-; CHECK-NEXT: vadd.i32 q1, q1, r9
+; CHECK-NEXT: vcmp.u32 hi, q6, q2
+; CHECK-NEXT: vshl.i32 q2, q1, #2
+; CHECK-NEXT: vadd.i32 q2, q2, r9
+; CHECK-NEXT: vadd.i32 q1, q1, r10
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [q2]
; CHECK-NEXT: bne .LBB1_12
@@ -507,13 +504,14 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
; CHECK-NEXT: @ Parent Loop BB1_10 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vqadd.u32 q2, q5, r1
+; CHECK-NEXT: vldrw.u32 q2, [r8]
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vcmp.u32 hi, q6, q2
-; CHECK-NEXT: vshl.i32 q2, q1, #2
+; CHECK-NEXT: vqadd.u32 q2, q2, r1
; CHECK-NEXT: add.w r1, r1, #4
-; CHECK-NEXT: vadd.i32 q2, q2, r8
-; CHECK-NEXT: vadd.i32 q1, q1, r9
+; CHECK-NEXT: vcmp.u32 hi, q5, q2
+; CHECK-NEXT: vshl.i32 q2, q1, #2
+; CHECK-NEXT: vadd.i32 q2, q2, r9
+; CHECK-NEXT: vadd.i32 q1, q1, r10
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [q2]
; CHECK-NEXT: bne .LBB1_14
@@ -523,7 +521,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: beq .LBB1_9
; CHECK-NEXT: @ %bb.16: @ %for.cond9.for.cond15.preheader_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB1_10 Depth=2
-; CHECK-NEXT: eor r1, r10, #1
+; CHECK-NEXT: eor r1, r5, #1
; CHECK-NEXT: lsls r1, r1, #31
; CHECK-NEXT: bne .LBB1_9
; CHECK-NEXT: b .LBB1_26
@@ -532,11 +530,11 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: cmp r6, #0
; CHECK-NEXT: beq.w .LBB1_2
; CHECK-NEXT: @ %bb.18: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: .LBB1_19: @ %for.body6.us60
; CHECK-NEXT: @ Parent Loop BB1_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: lsls.w r1, r10, #31
+; CHECK-NEXT: lsls r1, r5, #31
; CHECK-NEXT: bne .LBB1_27
; CHECK-NEXT: @ %bb.20: @ %for.cond.cleanup17.us63
; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2
@@ -552,19 +550,19 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: bgt .LBB1_25
; CHECK-NEXT: @ %bb.23: @ %for.cond.cleanup17.us63.3
; CHECK-NEXT: @ in Loop: Header=BB1_19 Depth=2
-; CHECK-NEXT: add.w r11, r0, #28
+; CHECK-NEXT: add.w r7, r0, #28
; CHECK-NEXT: cmn.w r0, #25
-; CHECK-NEXT: mov.w r10, #0
-; CHECK-NEXT: mov r0, r11
+; CHECK-NEXT: mov.w r5, #0
+; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: blt .LBB1_19
; CHECK-NEXT: b .LBB1_5
; CHECK-NEXT: .LBB1_24: @ %for.cond.cleanup5.loopexit134.split.loop.exit137
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r11, r0, #14
+; CHECK-NEXT: add.w r7, r0, #14
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_25: @ %for.cond.cleanup5.loopexit134.split.loop.exit135
; CHECK-NEXT: @ in Loop: Header=BB1_6 Depth=1
-; CHECK-NEXT: add.w r11, r0, #21
+; CHECK-NEXT: add.w r7, r0, #21
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_26: @ %for.inc19.us
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
@@ -574,7 +572,7 @@ define i32 @d(i64 %e, i32 %f, i64 %g, i32 %h) {
; CHECK-NEXT: b .LBB1_27
; CHECK-NEXT: .LBB1_28: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #16
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index 93cab25c2cb72e..aded1eb99d892a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -539,71 +539,71 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture reado
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #28
-; CHECK-NEXT: sub sp, #28
+; CHECK-NEXT: .pad #32
+; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: strd r1, r2, [sp, #4] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill
; CHECK-NEXT: blt .LBB11_5
; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: mov r9, r0
; CHECK-NEXT: movs r6, #1
-; CHECK-NEXT: add r2, sp, #12
-; CHECK-NEXT: mov.w r9, #8
+; CHECK-NEXT: mov.w r10, #8
; CHECK-NEXT: bic r1, r1, #7
-; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
-; CHECK-NEXT: sub.w r3, r1, #8
-; CHECK-NEXT: add.w r8, r6, r3, lsr #3
-; CHECK-NEXT: adr r3, .LCPI11_0
-; CHECK-NEXT: vldrw.u32 q0, [r3]
+; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: sub.w r7, r1, #8
+; CHECK-NEXT: add.w r0, r6, r7, lsr #3
+; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
+; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: .LBB11_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB11_3 Depth 2
-; CHECK-NEXT: dls lr, r8
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
+; CHECK-NEXT: dls lr, r1
+; CHECK-NEXT: adr r1, .LCPI11_0
+; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [r1]
; CHECK-NEXT: .LBB11_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vstrw.32 q1, [r2]
-; CHECK-NEXT: mov r12, r2
-; CHECK-NEXT: vldrh.s32 q2, [r2, #8]
-; CHECK-NEXT: vadd.i16 q1, q1, r9
-; CHECK-NEXT: vshl.i32 q2, q2, #1
-; CHECK-NEXT: vadd.i32 q2, q2, r0
-; CHECK-NEXT: vmov r7, r5, d5
-; CHECK-NEXT: vmov r3, r4, d4
-; CHECK-NEXT: vldrh.s32 q2, [r2]
-; CHECK-NEXT: vshl.i32 q2, q2, #1
-; CHECK-NEXT: vadd.i32 q2, q2, r0
-; CHECK-NEXT: vmov r1, r10, d5
-; CHECK-NEXT: ldrh r7, [r7]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vadd.i16 q0, q0, r10
+; CHECK-NEXT: vldrh.s32 q1, [r0, #8]
+; CHECK-NEXT: vshl.i32 q1, q1, #1
+; CHECK-NEXT: vadd.i32 q1, q1, r9
+; CHECK-NEXT: vmov r3, r6, d3
+; CHECK-NEXT: vmov r5, r4, d2
+; CHECK-NEXT: vldrh.s32 q1, [r0]
+; CHECK-NEXT: vshl.i32 q1, q1, #1
+; CHECK-NEXT: vadd.i32 q1, q1, r9
+; CHECK-NEXT: vmov r12, r11, d3
+; CHECK-NEXT: ldrh.w r8, [r6]
+; CHECK-NEXT: vmov r2, r6, d2
; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: ldrh r5, [r5]
-; CHECK-NEXT: ldrh.w r2, [r10]
-; CHECK-NEXT: ldrh.w r10, [r3]
-; CHECK-NEXT: vmov r3, r11, d4
-; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: ldrh.w r11, [r11]
-; CHECK-NEXT: vmov.16 q2[0], r3
-; CHECK-NEXT: vmov.16 q2[1], r11
-; CHECK-NEXT: vmov.16 q2[2], r1
-; CHECK-NEXT: vmov.16 q2[3], r2
-; CHECK-NEXT: mov r2, r12
-; CHECK-NEXT: vmov.16 q2[4], r10
-; CHECK-NEXT: vmov.16 q2[5], r4
-; CHECK-NEXT: vmov.16 q2[6], r7
-; CHECK-NEXT: vmov.16 q2[7], r5
-; CHECK-NEXT: vstrb.8 q2, [r6], #16
+; CHECK-NEXT: ldrh.w r1, [r11]
+; CHECK-NEXT: ldrh.w r11, [r5]
+; CHECK-NEXT: ldrh.w r5, [r12]
+; CHECK-NEXT: ldrh r2, [r2]
+; CHECK-NEXT: ldrh r6, [r6]
+; CHECK-NEXT: vmov.16 q1[0], r2
+; CHECK-NEXT: vmov.16 q1[1], r6
+; CHECK-NEXT: vmov.16 q1[2], r5
+; CHECK-NEXT: vmov.16 q1[3], r1
+; CHECK-NEXT: vmov.16 q1[4], r11
+; CHECK-NEXT: vmov.16 q1[5], r4
+; CHECK-NEXT: vmov.16 q1[6], r3
+; CHECK-NEXT: vmov.16 q1[7], r8
+; CHECK-NEXT: vstrb.8 q1, [r7], #16
; CHECK-NEXT: le lr, .LBB11_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload
-; CHECK-NEXT: cmp r3, r1
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #28
+; CHECK-NEXT: add sp, #32
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
@@ -656,145 +656,144 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #136
-; CHECK-NEXT: sub sp, #136
+; CHECK-NEXT: .pad #88
+; CHECK-NEXT: sub sp, #88
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill
; CHECK-NEXT: blt.w .LBB12_5
; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT: ldr r1, [sp, #68] @ 4-byte Reload
-; CHECK-NEXT: adr r3, .LCPI12_2
-; CHECK-NEXT: vldrw.u32 q0, [r3]
-; CHECK-NEXT: movs r2, #1
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add r4, sp, #72
+; CHECK-NEXT: add r7, sp, #40
+; CHECK-NEXT: add r5, sp, #56
; CHECK-NEXT: bic r1, r1, #7
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: subs r1, #8
-; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q2, #0x18
-; CHECK-NEXT: add.w r1, r2, r1, lsr #3
-; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill
-; CHECK-NEXT: adr r1, .LCPI12_0
-; CHECK-NEXT: adr r2, .LCPI12_1
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r2]
-; CHECK-NEXT: add r2, sp, #120
-; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill
+; CHECK-NEXT: vmov.i16 q6, #0x18
+; CHECK-NEXT: movs r3, #1
+; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: add.w r1, r3, r1, lsr #3
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB12_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB12_3 Depth 2
-; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, sp, #104
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: dls lr, r1
-; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload
-; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q5, [sp, #40] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q6, [sp, #8] @ 16-byte Reload
+; CHECK-NEXT: adr r1, .LCPI12_2
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: adr r1, .LCPI12_0
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: adr r1, .LCPI12_1
+; CHECK-NEXT: ldr.w r12, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: vldrw.u32 q3, [r1]
; CHECK-NEXT: .LBB12_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vstrw.32 q5, [r2]
-; CHECK-NEXT: mov r8, r2
-; CHECK-NEXT: vldrh.s32 q0, [r2, #8]
+; CHECK-NEXT: vstrw.32 q1, [r4]
+; CHECK-NEXT: mov r1, r5
+; CHECK-NEXT: vldrh.s32 q0, [r4, #8]
+; CHECK-NEXT: mov r11, r4
+; CHECK-NEXT: mov r5, r7
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r1, r3, d0
-; CHECK-NEXT: vmov r4, r5, d1
-; CHECK-NEXT: vldrh.s32 q0, [r2]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q2, q0, r0
-; CHECK-NEXT: vmov r6, r2, d4
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: ldrh.w r12, [r4]
-; CHECK-NEXT: add r4, sp, #88
-; CHECK-NEXT: ldrh.w r11, [r5]
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: ldrh r5, [r6]
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: vstrw.32 q6, [r4]
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r6, r10, d1
; CHECK-NEXT: vldrh.s32 q0, [r4]
-; CHECK-NEXT: vmov.16 q7[0], r5
-; CHECK-NEXT: vmov.16 q7[1], r2
+; CHECK-NEXT: vshl.i32 q0, q0, #1
+; CHECK-NEXT: vadd.i32 q6, q0, r0
+; CHECK-NEXT: vmov r7, r4, d12
+; CHECK-NEXT: ldrh.w r9, [r2]
+; CHECK-NEXT: ldrh.w r2, [r10]
+; CHECK-NEXT: str r2, [sp, #36] @ 4-byte Spill
+; CHECK-NEXT: ldrh.w r8, [r3]
+; CHECK-NEXT: ldrh r3, [r6]
+; CHECK-NEXT: ldrh r2, [r7]
+; CHECK-NEXT: mov r7, r5
+; CHECK-NEXT: ldrh r4, [r4]
+; CHECK-NEXT: vstrw.32 q3, [r7]
+; CHECK-NEXT: vldrh.s32 q0, [r7]
+; CHECK-NEXT: vmov.16 q4[0], r2
+; CHECK-NEXT: vmov.16 q4[1], r4
+; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r6, r9, d0
-; CHECK-NEXT: vmov r2, r5, d1
-; CHECK-NEXT: vldrh.s32 q0, [r4, #8]
+; CHECK-NEXT: vmov r4, r6, d0
+; CHECK-NEXT: vmov r1, r2, d1
+; CHECK-NEXT: vldrh.s32 q0, [r7, #8]
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: ldrh r6, [r6]
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: vmov.16 q1[0], r6
-; CHECK-NEXT: ldrh.w r6, [r9]
-; CHECK-NEXT: ldrh r5, [r5]
-; CHECK-NEXT: vmov.16 q1[1], r6
-; CHECK-NEXT: vmov.16 q1[2], r2
-; CHECK-NEXT: vmov r2, r6, d0
-; CHECK-NEXT: vmov.16 q1[3], r5
+; CHECK-NEXT: ldrh r4, [r4]
+; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: vmov.16 q5[0], r4
+; CHECK-NEXT: ldrh r4, [r6]
; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r6, [r6]
-; CHECK-NEXT: vmov.16 q1[4], r2
-; CHECK-NEXT: vmov r2, r5, d1
-; CHECK-NEXT: vmov.16 q1[5], r6
-; CHECK-NEXT: mov r6, r10
+; CHECK-NEXT: vmov.16 q5[1], r4
+; CHECK-NEXT: vmov.16 q5[2], r1
+; CHECK-NEXT: vmov r1, r4, d0
+; CHECK-NEXT: vmov.16 q5[3], r2
+; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: ldrh r4, [r4]
+; CHECK-NEXT: vmov.16 q5[4], r1
+; CHECK-NEXT: vmov r1, r2, d1
+; CHECK-NEXT: vmov.16 q5[5], r4
+; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r5, [r5]
-; CHECK-NEXT: vstrw.32 q4, [r10]
-; CHECK-NEXT: vldrh.s32 q0, [r6]
-; CHECK-NEXT: vmov.16 q1[6], r2
-; CHECK-NEXT: vmov.16 q1[7], r5
+; CHECK-NEXT: vstrw.32 q2, [r5]
+; CHECK-NEXT: vldrh.s32 q0, [r5]
+; CHECK-NEXT: vmov.16 q5[6], r1
+; CHECK-NEXT: vmov.16 q5[7], r2
; CHECK-NEXT: vshl.i32 q0, q0, #1
; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, r5, d0
+; CHECK-NEXT: vmov r1, r2, d0
+; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r5, [r5]
-; CHECK-NEXT: vmov.16 q3[0], r2
-; CHECK-NEXT: vmov.16 q3[1], r5
-; CHECK-NEXT: vmov r2, r5, d5
-; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
-; CHECK-NEXT: vadd.i16 q6, q6, q2
-; CHECK-NEXT: vadd.i16 q5, q5, q2
-; CHECK-NEXT: vadd.i16 q4, q4, q2
-; CHECK-NEXT: ldrh.w r9, [r2]
+; CHECK-NEXT: vmov.16 q7[0], r1
+; CHECK-NEXT: vmov.16 q7[1], r2
+; CHECK-NEXT: vmov r1, r2, d13
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vadd.i16 q3, q3, q6
+; CHECK-NEXT: vadd.i16 q1, q1, q6
+; CHECK-NEXT: vadd.i16 q2, q2, q6
+; CHECK-NEXT: ldrh.w r10, [r2]
; CHECK-NEXT: vmov r2, r4, d1
-; CHECK-NEXT: vldrh.s32 q0, [r6, #8]
-; CHECK-NEXT: ldrh r5, [r5]
-; CHECK-NEXT: vmov.16 q7[2], r9
+; CHECK-NEXT: vldrh.s32 q0, [r5, #8]
+; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vmov.16 q7[3], r5
+; CHECK-NEXT: vmov.16 q4[2], r1
; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov.16 q7[4], r1
-; CHECK-NEXT: vmov.16 q7[5], r3
-; CHECK-NEXT: vmov.16 q7[6], r12
-; CHECK-NEXT: vmov.16 q7[7], r11
+; CHECK-NEXT: vmov.16 q4[3], r10
+; CHECK-NEXT: vmov.16 q4[4], r9
+; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: vmov.16 q4[5], r8
+; CHECK-NEXT: vmov.16 q4[6], r3
+; CHECK-NEXT: vmov.16 q4[7], r1
; CHECK-NEXT: ldrh r2, [r2]
; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: vmov.16 q3[2], r2
-; CHECK-NEXT: vmov.16 q3[3], r4
+; CHECK-NEXT: vmov.16 q7[2], r2
+; CHECK-NEXT: vmov.16 q7[3], r4
; CHECK-NEXT: vmov r2, r4, d0
; CHECK-NEXT: ldrh r2, [r2]
; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: vmov.16 q3[4], r2
-; CHECK-NEXT: vmov.16 q3[5], r4
+; CHECK-NEXT: vmov.16 q7[4], r2
+; CHECK-NEXT: vmov.16 q7[5], r4
; CHECK-NEXT: vmov r2, r4, d1
; CHECK-NEXT: ldrh r2, [r2]
; CHECK-NEXT: ldrh r4, [r4]
-; CHECK-NEXT: vmov.16 q3[6], r2
-; CHECK-NEXT: mov r2, r8
-; CHECK-NEXT: vmov.16 q3[7], r4
-; CHECK-NEXT: vadd.i16 q0, q3, q1
-; CHECK-NEXT: vadd.i16 q0, q0, q7
-; CHECK-NEXT: vstrb.8 q0, [r7], #16
+; CHECK-NEXT: vmov.16 q7[6], r2
+; CHECK-NEXT: vmov.16 q7[7], r4
+; CHECK-NEXT: mov r4, r11
+; CHECK-NEXT: vadd.i16 q0, q7, q5
+; CHECK-NEXT: vadd.i16 q0, q0, q4
+; CHECK-NEXT: vstrb.8 q0, [r12], #16
; CHECK-NEXT: le lr, .LBB12_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload
-; CHECK-NEXT: cmp r1, r3
+; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: cmp r1, r2
; CHECK-NEXT: bne.w .LBB12_2
; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #136
+; CHECK-NEXT: add sp, #88
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -870,260 +869,246 @@ for.cond.cleanup: ; preds = %for.body, %middle.b
define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v16i8_complex:
; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: cmp r2, #1
+; CHECK-NEXT: it lt
+; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: .LBB13_1: @ %vector.ph.preheader
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #312
-; CHECK-NEXT: sub sp, #312
-; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: str r1, [sp, #116] @ 4-byte Spill
-; CHECK-NEXT: blt.w .LBB13_5
-; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT: adr r1, .LCPI13_0
-; CHECK-NEXT: adr r6, .LCPI13_8
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: adr r1, .LCPI13_1
-; CHECK-NEXT: adr r7, .LCPI13_7
-; CHECK-NEXT: adr r3, .LCPI13_6
-; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: adr r1, .LCPI13_5
-; CHECK-NEXT: bic r10, r2, #7
-; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r6]
-; CHECK-NEXT: adr r6, .LCPI13_9
-; CHECK-NEXT: vmov.i32 q2, #0x30
-; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r7]
-; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r6]
-; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r3]
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: .pad #160
+; CHECK-NEXT: sub sp, #160
+; CHECK-NEXT: bic lr, r2, #7
+; CHECK-NEXT: mov r12, r1
+; CHECK-NEXT: vmov.i32 q0, #0x30
; CHECK-NEXT: .LBB13_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB13_3 Depth 2
+; CHECK-NEXT: adr r1, .LCPI13_0
+; CHECK-NEXT: mov r8, r12
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: adr r1, .LCPI13_1
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: adr r1, .LCPI13_8
+; CHECK-NEXT: vldrw.u32 q4, [r1]
+; CHECK-NEXT: adr r1, .LCPI13_7
+; CHECK-NEXT: vldrw.u32 q5, [r1]
+; CHECK-NEXT: adr r1, .LCPI13_9
+; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: adr r1, .LCPI13_5
+; CHECK-NEXT: mov r9, lr
+; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: adr r1, .LCPI13_6
+; CHECK-NEXT: vstrw.32 q1, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: adr r1, .LCPI13_3
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q6, [r1]
; CHECK-NEXT: adr r1, .LCPI13_4
-; CHECK-NEXT: vldrw.u32 q5, [r1]
+; CHECK-NEXT: vldrw.u32 q7, [r1]
; CHECK-NEXT: adr r1, .LCPI13_2
-; CHECK-NEXT: vldrw.u32 q3, [r1]
+; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q7, [r1]
; CHECK-NEXT: adr r1, .LCPI13_10
-; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [r1]
-; CHECK-NEXT: adr r1, .LCPI13_11
-; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q7, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload
-; CHECK-NEXT: mov r11, r10
-; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: adr r1, .LCPI13_11
+; CHECK-NEXT: vldrw.u32 q3, [r1]
+; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill
; CHECK-NEXT: .LBB13_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vadd.i32 q4, q1, r0
-; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill
-; CHECK-NEXT: vmov r1, lr, d8
-; CHECK-NEXT: vadd.i32 q7, q7, r0
-; CHECK-NEXT: vmov r5, r4, d15
-; CHECK-NEXT: vadd.i32 q6, q0, r0
-; CHECK-NEXT: vmov r6, r7, d13
-; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q7, q6, r0
+; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q6, q3, r0
+; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q3, q1, r0
+; CHECK-NEXT: vmov r10, r1, d15
+; CHECK-NEXT: vmov r7, r11, d6
+; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT: vmov r5, r3, d13
+; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: vadd.i32 q2, q2, r0
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: subs.w r9, r9, #16
; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: subs.w r11, r11, #16
-; CHECK-NEXT: ldrb.w r9, [r1]
-; CHECK-NEXT: vmov r1, r3, d14
+; CHECK-NEXT: ldrb r6, [r1]
+; CHECK-NEXT: ldrb r1, [r7]
+; CHECK-NEXT: vmov r7, r4, d12
; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: ldrb r7, [r7]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[0], r1
-; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[1], r1
-; CHECK-NEXT: vmov r1, r3, d12
+; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: vmov.8 q6[0], r7
+; CHECK-NEXT: vmov.8 q6[1], r4
+; CHECK-NEXT: vmov.8 q6[2], r5
+; CHECK-NEXT: vmov r4, r5, d14
+; CHECK-NEXT: vmov.8 q6[3], r3
+; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: ldrb r7, [r5]
+; CHECK-NEXT: vmov.8 q7[0], r4
+; CHECK-NEXT: ldrb.w r5, [r10]
+; CHECK-NEXT: vmov.8 q7[1], r7
+; CHECK-NEXT: ldrb.w r7, [r11]
; CHECK-NEXT: vmov.8 q7[2], r5
-; CHECK-NEXT: ldrb r5, [r6]
-; CHECK-NEXT: ldrb r6, [r4]
+; CHECK-NEXT: vmov r5, r10, d5
; CHECK-NEXT: vmov.8 q7[3], r6
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q6[0], r1
-; CHECK-NEXT: vmov r6, r1, d2
-; CHECK-NEXT: vmov.8 q6[1], r3
-; CHECK-NEXT: vmov.8 q6[2], r5
-; CHECK-NEXT: vmov.8 q6[3], r7
-; CHECK-NEXT: ldrb.w r7, [lr]
-; CHECK-NEXT: vmov.8 q6[4], r9
-; CHECK-NEXT: vmov.8 q6[5], r7
-; CHECK-NEXT: ldrb r4, [r1]
-; CHECK-NEXT: vmov r1, r5, d3
-; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload
-; CHECK-NEXT: ldrb.w r12, [r1]
-; CHECK-NEXT: vmov r1, r3, d9
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q6[6], r1
-; CHECK-NEXT: vmov r1, r7, d0
-; CHECK-NEXT: vmov.8 q6[7], r3
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: ldrb r7, [r7]
+; CHECK-NEXT: vmov r3, r4, d4
; CHECK-NEXT: vmov.8 q7[4], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vmov.8 q7[5], r7
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: ldrb r6, [r5]
+; CHECK-NEXT: vmov r1, r5, d7
+; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q3, q3, q0
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: ldrb r3, [r3]
+; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: vmov.8 q7[6], r1
-; CHECK-NEXT: ldrb r1, [r6]
-; CHECK-NEXT: vmov r7, r6, d0
-; CHECK-NEXT: vmov.8 q7[7], r3
-; CHECK-NEXT: vmov r3, lr, d1
-; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[8], r1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov.8 q7[9], r4
-; CHECK-NEXT: vmov r4, r1, d0
-; CHECK-NEXT: vmov.8 q7[10], r12
-; CHECK-NEXT: vmov.8 q7[11], r5
+; CHECK-NEXT: vmov r1, r7, d2
+; CHECK-NEXT: vmov.8 q7[7], r5
+; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r7, [r7]
-; CHECK-NEXT: ldrb r6, [r6]
+; CHECK-NEXT: vmov.8 q6[4], r1
+; CHECK-NEXT: vmov r1, r5, d3
+; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT: vmov.8 q6[5], r7
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: ldrb r1, [r1]
+; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: vmov.8 q6[6], r1
+; CHECK-NEXT: ldrb r1, [r3]
+; CHECK-NEXT: vmov.8 q6[7], r5
+; CHECK-NEXT: vmov r3, r7, d2
+; CHECK-NEXT: vmov.8 q6[8], r1
+; CHECK-NEXT: vmov r1, r11, d3
+; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT: vmov.8 q6[9], r4
+; CHECK-NEXT: vmov.8 q6[10], r6
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r5, r6, d2
+; CHECK-NEXT: ldrb r4, [r7]
+; CHECK-NEXT: ldrb.w r7, [r10]
; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: vmov.8 q6[11], r7
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q6[8], r4
-; CHECK-NEXT: vmov r5, r4, d1
-; CHECK-NEXT: vmov.8 q6[9], r1
-; CHECK-NEXT: vadd.i32 q0, q5, r0
-; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload
; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: vmov.8 q6[10], r5
-; CHECK-NEXT: vmov.8 q6[11], r4
-; CHECK-NEXT: vmov.8 q6[12], r7
-; CHECK-NEXT: vmov.8 q6[13], r6
-; CHECK-NEXT: vmov.8 q6[14], r3
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: ldrb r6, [r6]
+; CHECK-NEXT: vmov.8 q7[8], r5
+; CHECK-NEXT: vmov r5, r7, d3
+; CHECK-NEXT: vmov.8 q7[9], r6
+; CHECK-NEXT: vadd.i32 q1, q2, r0
+; CHECK-NEXT: vadd.i32 q2, q2, q0
+; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q2, q2, q0
+; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: ldrb r7, [r7]
+; CHECK-NEXT: vmov.8 q7[10], r5
+; CHECK-NEXT: vmov.8 q7[11], r7
+; CHECK-NEXT: vmov.8 q7[12], r3
+; CHECK-NEXT: vmov.8 q7[13], r4
+; CHECK-NEXT: vmov.8 q7[14], r1
+; CHECK-NEXT: vmov r1, r3, d2
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[12], r1
+; CHECK-NEXT: vmov.8 q6[12], r1
; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[13], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q1, r0
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vmov.8 q6[13], r1
+; CHECK-NEXT: vmov r1, r3, d3
+; CHECK-NEXT: vadd.i32 q1, q5, r0
+; CHECK-NEXT: vadd.i32 q5, q5, q0
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[14], r1
+; CHECK-NEXT: vmov.8 q6[14], r1
; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[15], r1
-; CHECK-NEXT: ldrb.w r1, [lr]
; CHECK-NEXT: vmov.8 q6[15], r1
-; CHECK-NEXT: vmov r1, r3, d0
-; CHECK-NEXT: vadd.i8 q6, q6, q7
+; CHECK-NEXT: ldrb.w r1, [r11]
+; CHECK-NEXT: vmov.8 q7[15], r1
+; CHECK-NEXT: vmov r1, r3, d2
+; CHECK-NEXT: vadd.i8 q6, q7, q6
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: vmov.8 q7[0], r1
; CHECK-NEXT: vmov.8 q7[1], r3
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q3, r0
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vmov r1, r3, d3
+; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[2], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[3], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d2
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[4], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[5], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q5, r0
-; CHECK-NEXT: vadd.i32 q5, q5, q2
-; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q5, q5, q2
+; CHECK-NEXT: vmov r1, r3, d3
+; CHECK-NEXT: vadd.i32 q1, q4, r0
+; CHECK-NEXT: vadd.i32 q4, q4, q0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[6], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[7], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d2
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[8], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[9], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q4, r0
-; CHECK-NEXT: vadd.i32 q4, q4, q2
-; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vmov r1, r3, d3
+; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q1, q1, r0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[10], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[11], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d2
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[12], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[13], r1
-; CHECK-NEXT: vmov r1, r3, d1
+; CHECK-NEXT: vmov r1, r3, d3
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[14], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[15], r1
-; CHECK-NEXT: vadd.i8 q0, q6, q7
-; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload
-; CHECK-NEXT: vstrb.8 q0, [r8], #16
-; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q7, q7, q2
-; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vadd.i8 q1, q6, q7
+; CHECK-NEXT: vldrw.u32 q7, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vstrb.8 q1, [r8], #16
+; CHECK-NEXT: vadd.i32 q7, q7, q0
+; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q7, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q6, q6, q0
+; CHECK-NEXT: vadd.i32 q1, q1, q0
+; CHECK-NEXT: vadd.i32 q7, q7, q0
+; CHECK-NEXT: vstrw.32 q7, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q7, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q7, q7, q0
+; CHECK-NEXT: vstrw.32 q7, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q7, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q7, q7, q0
+; CHECK-NEXT: vstrw.32 q7, [sp, #128] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q7, q7, q0
+; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill
; CHECK-NEXT: bne.w .LBB13_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1
-; CHECK-NEXT: cmp r10, r2
+; CHECK-NEXT: cmp lr, r2
; CHECK-NEXT: bne.w .LBB13_2
-; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #312
+; CHECK-NEXT: @ %bb.5:
+; CHECK-NEXT: add sp, #160
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.6:
; CHECK-NEXT: .LCPI13_0:
@@ -1232,102 +1217,95 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #64
-; CHECK-NEXT: sub sp, #64
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: cmp r2, #1
-; CHECK-NEXT: strd r1, r2, [sp, #56] @ 8-byte Folded Spill
-; CHECK-NEXT: blt.w .LBB14_5
+; CHECK-NEXT: strd r1, r2, [sp, #8] @ 8-byte Folded Spill
+; CHECK-NEXT: blt .LBB14_5
; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT: adr r5, .LCPI14_3
-; CHECK-NEXT: adr r7, .LCPI14_1
-; CHECK-NEXT: vldrw.u32 q0, [r5]
-; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT: adr r3, .LCPI14_0
-; CHECK-NEXT: adr r6, .LCPI14_2
-; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r7]
-; CHECK-NEXT: bic r9, r1, #7
-; CHECK-NEXT: vldrw.u32 q3, [r3]
-; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [r6]
-; CHECK-NEXT: mov.w lr, #16
-; CHECK-NEXT: str.w r9, [sp, #52] @ 4-byte Spill
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: mov.w r11, #16
+; CHECK-NEXT: bic r3, r1, #7
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB14_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB14_3 Depth 2
-; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload
-; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vmov q4, q3
+; CHECK-NEXT: adr r1, .LCPI14_3
+; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: vldrw.u32 q5, [r1]
+; CHECK-NEXT: adr r1, .LCPI14_1
+; CHECK-NEXT: vldrw.u32 q4, [r1]
+; CHECK-NEXT: adr r1, .LCPI14_2
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: adr r1, .LCPI14_0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: .LBB14_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vadd.i32 q1, q5, r0
-; CHECK-NEXT: vadd.i32 q2, q4, r0
-; CHECK-NEXT: vmov r7, r3, d3
-; CHECK-NEXT: vadd.i32 q6, q0, lr
-; CHECK-NEXT: vmov r5, r6, d5
-; CHECK-NEXT: subs.w r9, r9, #16
-; CHECK-NEXT: vmov r4, r10, d2
-; CHECK-NEXT: vadd.i32 q1, q7, lr
-; CHECK-NEXT: vadd.i32 q4, q4, lr
-; CHECK-NEXT: vadd.i32 q5, q5, lr
-; CHECK-NEXT: ldrb.w r11, [r3]
-; CHECK-NEXT: ldrb r3, [r7]
-; CHECK-NEXT: vmov r7, r12, d4
-; CHECK-NEXT: vadd.i32 q2, q7, r0
-; CHECK-NEXT: vadd.i32 q7, q0, r0
+; CHECK-NEXT: vadd.i32 q6, q1, r0
+; CHECK-NEXT: vadd.i32 q2, q0, r0
+; CHECK-NEXT: vmov r4, r5, d13
+; CHECK-NEXT: vadd.i32 q3, q5, r11
+; CHECK-NEXT: vmov lr, r8, d4
+; CHECK-NEXT: subs r3, #16
+; CHECK-NEXT: vmov r6, r12, d5
+; CHECK-NEXT: vadd.i32 q2, q4, r11
+; CHECK-NEXT: vadd.i32 q1, q1, r11
+; CHECK-NEXT: vadd.i32 q0, q0, r11
+; CHECK-NEXT: ldrb.w r10, [r5]
+; CHECK-NEXT: vmov r2, r5, d12
+; CHECK-NEXT: vadd.i32 q6, q5, r0
+; CHECK-NEXT: vadd.i32 q5, q4, r0
+; CHECK-NEXT: ldrb.w r1, [r8]
+; CHECK-NEXT: ldrb.w r9, [r4]
+; CHECK-NEXT: ldrb r4, [r6]
+; CHECK-NEXT: ldrb.w r6, [lr]
+; CHECK-NEXT: ldrb.w r12, [r12]
+; CHECK-NEXT: ldrb r2, [r2]
; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r6, [r6]
+; CHECK-NEXT: vmov.8 q4[0], r2
+; CHECK-NEXT: vmov.8 q4[1], r5
+; CHECK-NEXT: vmov r8, r5, d11
+; CHECK-NEXT: vmov.8 q4[2], r9
+; CHECK-NEXT: vmov.8 q4[3], r10
+; CHECK-NEXT: vmov.8 q4[4], r6
+; CHECK-NEXT: vmov.8 q4[5], r1
+; CHECK-NEXT: vmov.8 q4[6], r4
+; CHECK-NEXT: vmov r4, r6, d10
+; CHECK-NEXT: vmov.8 q4[7], r12
+; CHECK-NEXT: vmov q5, q3
+; CHECK-NEXT: ldrb.w lr, [r5]
+; CHECK-NEXT: vmov r5, r2, d13
; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb.w r10, [r10]
-; CHECK-NEXT: ldrb r7, [r7]
-; CHECK-NEXT: ldrb.w r1, [r12]
-; CHECK-NEXT: vmov.8 q0[0], r7
-; CHECK-NEXT: vmov.8 q0[1], r1
-; CHECK-NEXT: vmov r1, r7, d15
-; CHECK-NEXT: vmov.8 q0[2], r5
-; CHECK-NEXT: vmov.8 q0[3], r6
-; CHECK-NEXT: vmov.8 q0[4], r4
-; CHECK-NEXT: vmov r4, r2, d4
-; CHECK-NEXT: vmov.8 q0[5], r10
-; CHECK-NEXT: vmov.8 q0[6], r3
-; CHECK-NEXT: vmov.8 q0[7], r11
-; CHECK-NEXT: ldrb r6, [r7]
-; CHECK-NEXT: vmov r5, r7, d5
+; CHECK-NEXT: ldrb r6, [r6]
+; CHECK-NEXT: vmov.8 q4[8], r4
+; CHECK-NEXT: vmov.8 q4[9], r6
+; CHECK-NEXT: ldrb.w r9, [r2]
+; CHECK-NEXT: vmov r1, r2, d12
+; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: ldrb.w r10, [r2]
+; CHECK-NEXT: ldrb.w r2, [r8]
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r5]
-; CHECK-NEXT: ldrb.w r12, [r7]
-; CHECK-NEXT: ldrb r5, [r4]
-; CHECK-NEXT: vmov r4, r7, d14
-; CHECK-NEXT: vmov q7, q1
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r7, [r7]
-; CHECK-NEXT: vmov.8 q0[8], r4
-; CHECK-NEXT: vmov.8 q0[9], r7
-; CHECK-NEXT: vmov.8 q0[10], r1
-; CHECK-NEXT: vmov.8 q0[11], r6
-; CHECK-NEXT: vmov.8 q0[12], r5
-; CHECK-NEXT: vmov.8 q0[13], r2
-; CHECK-NEXT: vmov.8 q0[14], r3
-; CHECK-NEXT: vmov.8 q0[15], r12
-; CHECK-NEXT: vstrb.8 q0, [r8], #16
-; CHECK-NEXT: vmov q0, q6
+; CHECK-NEXT: vmov.8 q4[10], r2
+; CHECK-NEXT: vmov.8 q4[11], lr
+; CHECK-NEXT: vmov.8 q4[12], r1
+; CHECK-NEXT: vmov.8 q4[13], r10
+; CHECK-NEXT: vmov.8 q4[14], r5
+; CHECK-NEXT: vmov.8 q4[15], r9
+; CHECK-NEXT: vstrb.8 q4, [r7], #16
+; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: bne .LBB14_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload
-; CHECK-NEXT: cmp r9, r1
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: bne .LBB14_2
; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #64
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
index 7b8b884576d13e..82ec62ec9f7a13 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -602,60 +602,57 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
-; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrne r0, [sp, #136]
+; CHECK-NEXT: ldrne r0, [sp, #112]
; CHECK-NEXT: cmpne r0, #0
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #32
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader
-; CHECK-NEXT: ldr.w r12, [sp, #140]
+; CHECK-NEXT: ldr.w r12, [sp, #116]
; CHECK-NEXT: movs r7, #1
-; CHECK-NEXT: mov.w r11, #0
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: bic r2, r12, #3
-; CHECK-NEXT: subs r3, r2, #4
-; CHECK-NEXT: add.w r0, r7, r3, lsr #2
-; CHECK-NEXT: ldr r7, [sp, #136]
-; CHECK-NEXT: adr r3, .LCPI10_0
-; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: lsl.w r0, r12, #1
-; CHECK-NEXT: vdup.32 q1, r7
-; CHECK-NEXT: vldrw.u32 q2, [r3]
-; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT: lsls r6, r7, #1
-; CHECK-NEXT: vshl.i32 q3, q1, #2
-; CHECK-NEXT: movs r3, #0
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: movs r5, #0
+; CHECK-NEXT: bic r0, r12, #3
+; CHECK-NEXT: subs r3, r0, #4
+; CHECK-NEXT: add.w r3, r7, r3, lsr #2
+; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: ldr r3, [sp, #112]
+; CHECK-NEXT: lsl.w r7, r12, #1
+; CHECK-NEXT: str r7, [sp] @ 4-byte Spill
+; CHECK-NEXT: movs r7, #0
+; CHECK-NEXT: vdup.32 q1, r3
+; CHECK-NEXT: lsls r6, r3, #1
+; CHECK-NEXT: vshl.i32 q2, q1, #2
+; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: str r3, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: b .LBB10_5
; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: add.w r3, r0, r5, lsl #1
+; CHECK-NEXT: add.w r3, r2, r8, lsl #1
; CHECK-NEXT: wlstp.8 lr, r6, .LBB10_4
; CHECK-NEXT: b .LBB10_15
; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: add r11, r12
-; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add r3, r0
-; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: adds r3, #1
-; CHECK-NEXT: cmp r3, r0
+; CHECK-NEXT: ldr r3, [sp] @ 4-byte Reload
+; CHECK-NEXT: add r7, r12
+; CHECK-NEXT: ldr r5, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: add r5, r3
+; CHECK-NEXT: str r5, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: adds r5, #1
+; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: beq .LBB10_1
; CHECK-NEXT: .LBB10_5: @ %for.cond1.preheader.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -663,21 +660,22 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
; CHECK-NEXT: @ Child Loop BB10_15 Depth 2
-; CHECK-NEXT: mul r5, r3, r7
+; CHECK-NEXT: ldr r3, [sp, #112]
; CHECK-NEXT: cmp.w r12, #0
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r5, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: mul r8, r5, r3
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: b .LBB10_8
; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: add.w r0, r8, r5
-; CHECK-NEXT: add.w r8, r8, #1
-; CHECK-NEXT: cmp r8, r7
-; CHECK-NEXT: strh.w r10, [r3, r0, lsl #1]
+; CHECK-NEXT: add.w r3, r9, r8
+; CHECK-NEXT: add.w r9, r9, #1
+; CHECK-NEXT: strh.w r10, [r2, r3, lsl #1]
+; CHECK-NEXT: ldr r3, [sp, #112]
+; CHECK-NEXT: cmp r9, r3
; CHECK-NEXT: beq .LBB10_4
; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
@@ -692,46 +690,48 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: b .LBB10_13
; CHECK-NEXT: .LBB10_10: @ %vector.ph
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: vmov q5, q1
-; CHECK-NEXT: vmov.i32 q4, #0x0
-; CHECK-NEXT: vmlas.i32 q5, q2, r8
-; CHECK-NEXT: dls lr, r0
-; CHECK-NEXT: ldr r3, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: adr r3, .LCPI10_0
+; CHECK-NEXT: vmov q4, q1
+; CHECK-NEXT: vldrw.u32 q5, [r3]
+; CHECK-NEXT: ldr r3, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: dls lr, r3
+; CHECK-NEXT: vmlas.i32 q4, q5, r9
+; CHECK-NEXT: ldr r3, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: .LBB10_11: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vadd.i32 q6, q5, q3
-; CHECK-NEXT: vldrh.s32 q7, [r1, q5, uxtw #1]
-; CHECK-NEXT: vldrh.s32 q5, [r3], #8
-; CHECK-NEXT: vmul.i32 q5, q7, q5
-; CHECK-NEXT: vadd.i32 q4, q5, q4
-; CHECK-NEXT: vmov q5, q6
+; CHECK-NEXT: vadd.i32 q5, q4, q2
+; CHECK-NEXT: vldrh.s32 q6, [r1, q4, uxtw #1]
+; CHECK-NEXT: vldrh.s32 q4, [r3], #8
+; CHECK-NEXT: vmul.i32 q4, q6, q4
+; CHECK-NEXT: vadd.i32 q3, q4, q3
+; CHECK-NEXT: vmov q4, q5
; CHECK-NEXT: le lr, .LBB10_11
; CHECK-NEXT: @ %bb.12: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: vaddv.u32 r10, q4
-; CHECK-NEXT: cmp r2, r12
-; CHECK-NEXT: mov r4, r2
+; CHECK-NEXT: vaddv.u32 r10, q3
+; CHECK-NEXT: cmp r0, r12
+; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: beq .LBB10_7
; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: mla r3, r7, r4, r8
-; CHECK-NEXT: add.w r0, r11, r4
-; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: ldr r3, [sp, #112]
; CHECK-NEXT: sub.w lr, r12, r4
-; CHECK-NEXT: add.w r9, r7, r0, lsl #1
-; CHECK-NEXT: ldr r7, [sp, #136]
-; CHECK-NEXT: add.w r3, r1, r3, lsl #1
+; CHECK-NEXT: ldr r5, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: mla r3, r3, r4, r9
+; CHECK-NEXT: add.w r11, r1, r3, lsl #1
+; CHECK-NEXT: adds r3, r7, r4
+; CHECK-NEXT: add.w r3, r5, r3, lsl #1
; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: ldrsh.w r4, [r3]
-; CHECK-NEXT: add r3, r6
-; CHECK-NEXT: ldrsh r0, [r9], #2
-; CHECK-NEXT: smlabb r10, r4, r0, r10
+; CHECK-NEXT: ldrsh.w r5, [r11]
+; CHECK-NEXT: add r11, r6
+; CHECK-NEXT: ldrsh r4, [r3], #2
+; CHECK-NEXT: smlabb r10, r5, r4, r10
; CHECK-NEXT: le lr, .LBB10_14
; CHECK-NEXT: b .LBB10_7
; CHECK-NEXT: .LBB10_15: @ Parent Loop BB10_5 Depth=1
diff --git a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
index 70957ca950d71f..ba910d62362ddb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pipelineloops.ll
@@ -17,51 +17,40 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32
; CHECK-NEXT: cmp r7, #2
; CHECK-NEXT: csel r7, r6, r5, hs
; CHECK-NEXT: add.w lr, r7, #1
-; CHECK-NEXT: mov r4, r5
-; CHECK-NEXT: vldrh.u16 q0, [r0], #32
+; CHECK-NEXT: mov r6, r5
+; CHECK-NEXT: vldrh.u16 q1, [r0], #32
; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: mov r8, r5
-; CHECK-NEXT: vldrh.u16 q1, [r1], #32
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q0, q1
-; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1
-; CHECK-NEXT: vldrh.u16 q3, [r1, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3
; CHECK-NEXT: vldrh.u16 q0, [r1], #32
-; CHECK-NEXT: sub.w lr, lr, #1
-; CHECK-NEXT: cmp.w lr, #0
-; CHECK-NEXT: vldrh.u16 q1, [r0], #32
-; CHECK-NEXT: beq .LBB0_3
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB0_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3
-; CHECK-NEXT: vldrh.u16 q3, [r1, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0
+; CHECK-NEXT: vmlsldava.s16 r8, r7, q1, q0
; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0
-; CHECK-NEXT: vldrh.u16 q1, [r0], #32
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q3
+; CHECK-NEXT: vmlaldavax.s16 r6, r5, q1, q0
+; CHECK-NEXT: vldrh.u16 q1, [r1, #-16]
+; CHECK-NEXT: vmlsldava.s16 r8, r7, q2, q1
; CHECK-NEXT: vldrh.u16 q0, [r1], #32
+; CHECK-NEXT: vmlaldavax.s16 r6, r5, q2, q1
+; CHECK-NEXT: vldrh.u16 q1, [r0], #32
; CHECK-NEXT: le lr, .LBB0_2
-; CHECK-NEXT: .LBB0_3:
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q3
-; CHECK-NEXT: movs r6, #14
-; CHECK-NEXT: and.w r2, r6, r2, lsl #1
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q1, q0
+; CHECK-NEXT: @ %bb.3: @ %do.body
+; CHECK-NEXT: movs r4, #14
+; CHECK-NEXT: and.w r2, r4, r2, lsl #1
+; CHECK-NEXT: vmlaldavax.s16 r6, r5, q1, q0
; CHECK-NEXT: vldrh.u16 q2, [r0, #-16]
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q1, q0
+; CHECK-NEXT: vmlsldava.s16 r8, r7, q1, q0
; CHECK-NEXT: vldrh.u16 q0, [r1, #-16]
-; CHECK-NEXT: vmlaldavax.s16 r8, r5, q2, q0
+; CHECK-NEXT: vmlaldavax.s16 r6, r5, q2, q0
; CHECK-NEXT: vctp.16 r2
-; CHECK-NEXT: vmlsldava.s16 r4, r7, q2, q0
+; CHECK-NEXT: vmlsldava.s16 r8, r7, q2, q0
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrht.u16 q1, [r0]
; CHECK-NEXT: cmp r2, #9
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrht.u16 q0, [r1]
-; CHECK-NEXT: vmlsldavat.s16 r4, r7, q1, q0
-; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q1, q0
+; CHECK-NEXT: vmlsldavat.s16 r8, r7, q1, q0
+; CHECK-NEXT: vmlaldavaxt.s16 r6, r5, q1, q0
; CHECK-NEXT: blo .LBB0_10
; CHECK-NEXT: @ %bb.4: @ %do.body.1
; CHECK-NEXT: subs r2, #8
@@ -69,17 +58,17 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrht.u16 q0, [r0, #16]
; CHECK-NEXT: vldrht.u16 q1, [r1, #16]
-; CHECK-NEXT: vmlsldavat.s16 r4, r7, q0, q1
-; CHECK-NEXT: vmlaldavaxt.s16 r8, r5, q0, q1
+; CHECK-NEXT: vmlsldavat.s16 r8, r7, q0, q1
+; CHECK-NEXT: vmlaldavaxt.s16 r6, r5, q0, q1
; CHECK-NEXT: b .LBB0_10
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB0_5: @ %if.else
-; CHECK-NEXT: mov.w r4, #0
+; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: cbz r2, .LBB0_9
; CHECK-NEXT: @ %bb.6: @ %while.body14.preheader
; CHECK-NEXT: lsls r6, r2, #1
-; CHECK-NEXT: mov r5, r4
-; CHECK-NEXT: mov r7, r4
+; CHECK-NEXT: mov r5, r8
+; CHECK-NEXT: mov r7, r8
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.16 lr, r6
; CHECK-NEXT: .p2align 2
@@ -88,22 +77,22 @@ define void @arm_cmplx_dot_prod_q15(ptr noundef %pSrcA, ptr noundef %pSrcB, i32
; CHECK-NEXT: vldrh.u16 q0, [r0], #16
; CHECK-NEXT: vldrh.u16 q1, [r1], #16
; CHECK-NEXT: vmlsldava.s16 r2, r7, q0, q1
-; CHECK-NEXT: vmlaldavax.s16 r4, r5, q0, q1
+; CHECK-NEXT: vmlaldavax.s16 r8, r5, q0, q1
; CHECK-NEXT: letp lr, .LBB0_7
; CHECK-NEXT: @ %bb.8: @ %if.end.loopexit177
-; CHECK-NEXT: mov r8, r4
-; CHECK-NEXT: mov r4, r2
+; CHECK-NEXT: mov r6, r8
+; CHECK-NEXT: mov r8, r2
; CHECK-NEXT: b .LBB0_10
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB0_9:
-; CHECK-NEXT: mov r7, r4
-; CHECK-NEXT: mov.w r8, #0
-; CHECK-NEXT: mov r5, r4
+; CHECK-NEXT: mov r7, r8
+; CHECK-NEXT: movs r6, #0
+; CHECK-NEXT: mov r5, r8
; CHECK-NEXT: .LBB0_10: @ %if.end
-; CHECK-NEXT: asrl r4, r7, #6
-; CHECK-NEXT: asrl r8, r5, #6
-; CHECK-NEXT: str r4, [r3]
-; CHECK-NEXT: str.w r8, [r12]
+; CHECK-NEXT: asrl r8, r7, #6
+; CHECK-NEXT: asrl r6, r5, #6
+; CHECK-NEXT: str.w r8, [r3]
+; CHECK-NEXT: str.w r6, [r12]
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%cmp = icmp ugt i32 %numSamples, 15
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
index b3a0c7dffae117..85d302abfd1aef 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -138,6 +138,7 @@ define dso_local i64 @caller_argv64i1() #0 {
; WIN64-NEXT: movq %rax, %rcx
; WIN64-NEXT: movq %rax, %rdx
; WIN64-NEXT: movq %rax, %rdi
+; WIN64-NEXT: movq %rax, %rsi
; WIN64-NEXT: movq %rax, %r8
; WIN64-NEXT: movq %rax, %r9
; WIN64-NEXT: movq %rax, %r10
@@ -145,7 +146,6 @@ define dso_local i64 @caller_argv64i1() #0 {
; WIN64-NEXT: movq %rax, %r12
; WIN64-NEXT: movq %rax, %r14
; WIN64-NEXT: movq %rax, %r15
-; WIN64-NEXT: movq %rax, %rsi
; WIN64-NEXT: callq test_argv64i1
; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; WIN64-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
@@ -178,13 +178,13 @@ define dso_local i64 @caller_argv64i1() #0 {
; LINUXOSX64-NEXT: movq %rax, %rcx
; LINUXOSX64-NEXT: movq %rax, %rdx
; LINUXOSX64-NEXT: movq %rax, %rdi
+; LINUXOSX64-NEXT: movq %rax, %rsi
; LINUXOSX64-NEXT: movq %rax, %r8
; LINUXOSX64-NEXT: movq %rax, %r9
; LINUXOSX64-NEXT: movq %rax, %r12
; LINUXOSX64-NEXT: movq %rax, %r13
; LINUXOSX64-NEXT: movq %rax, %r14
; LINUXOSX64-NEXT: movq %rax, %r15
-; LINUXOSX64-NEXT: movq %rax, %rsi
; LINUXOSX64-NEXT: pushq %rax
; LINUXOSX64-NEXT: .cfi_adjust_cfa_offset 8
; LINUXOSX64-NEXT: pushq %rax
diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
index 2081d201704f3a..3aed15caa4ea70 100644
--- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -987,8 +987,6 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
; WIN64: # %bb.0:
; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: pushq %rbx
-; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
-; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15
; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14
; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12
@@ -996,7 +994,9 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
; WIN64-NEXT: # kill: def $r10d killed $r10d def $r10
; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9
; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8
+; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $edi killed $edi def $rdi
+; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
; WIN64-NEXT: leal (%rdx,%rdi), %ebx
; WIN64-NEXT: movl %edx, %ebp
; WIN64-NEXT: subl %edi, %ebp
@@ -1032,14 +1032,14 @@ define dso_local x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %
;
; LINUXOSX64-LABEL: testi32_inp:
; LINUXOSX64: # %bb.0:
-; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx
-; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX64-NEXT: # kill: def $r14d killed $r14d def $r14
; LINUXOSX64-NEXT: # kill: def $r13d killed $r13d def $r13
; LINUXOSX64-NEXT: # kill: def $r12d killed $r12d def $r12
; LINUXOSX64-NEXT: # kill: def $r9d killed $r9d def $r9
; LINUXOSX64-NEXT: # kill: def $r8d killed $r8d def $r8
+; LINUXOSX64-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX64-NEXT: # kill: def $edi killed $edi def $rdi
+; LINUXOSX64-NEXT: # kill: def $edx killed $edx def $rdx
; LINUXOSX64-NEXT: leal (%rdx,%rdi), %r10d
; LINUXOSX64-NEXT: movl %edx, %r11d
; LINUXOSX64-NEXT: subl %edi, %r11d
diff --git a/llvm/test/CodeGen/X86/sse-regcall.ll b/llvm/test/CodeGen/X86/sse-regcall.ll
index 6f0293392eef2b..e014c9f8953836 100644
--- a/llvm/test/CodeGen/X86/sse-regcall.ll
+++ b/llvm/test/CodeGen/X86/sse-regcall.ll
@@ -244,8 +244,6 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
; WIN64: # %bb.0:
; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: pushq %rbx
-; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
-; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $r15d killed $r15d def $r15
; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14
; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12
@@ -253,7 +251,9 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
; WIN64-NEXT: # kill: def $r10d killed $r10d def $r10
; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9
; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8
+; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $edi killed $edi def $rdi
+; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
; WIN64-NEXT: leal (%rdx,%rdi), %ebx
; WIN64-NEXT: movl %edx, %ebp
; WIN64-NEXT: subl %edi, %ebp
@@ -289,14 +289,14 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
;
; LINUXOSX-LABEL: testi32_inp:
; LINUXOSX: # %bb.0:
-; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx
-; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX-NEXT: # kill: def $r14d killed $r14d def $r14
; LINUXOSX-NEXT: # kill: def $r13d killed $r13d def $r13
; LINUXOSX-NEXT: # kill: def $r12d killed $r12d def $r12
; LINUXOSX-NEXT: # kill: def $r9d killed $r9d def $r9
; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8
+; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi
+; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx
; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d
; LINUXOSX-NEXT: movl %edx, %r11d
; LINUXOSX-NEXT: subl %edi, %r11d
diff --git a/llvm/test/CodeGen/X86/sse-regcall4.ll b/llvm/test/CodeGen/X86/sse-regcall4.ll
index c8df7a233d7e3f..f66f9d9d449420 100644
--- a/llvm/test/CodeGen/X86/sse-regcall4.ll
+++ b/llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -244,14 +244,14 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
; WIN64: # %bb.0:
; WIN64-NEXT: pushq %rbp
; WIN64-NEXT: pushq %rbx
-; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
-; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $r14d killed $r14d def $r14
; WIN64-NEXT: # kill: def $r12d killed $r12d def $r12
; WIN64-NEXT: # kill: def $r11d killed $r11d def $r11
; WIN64-NEXT: # kill: def $r9d killed $r9d def $r9
; WIN64-NEXT: # kill: def $r8d killed $r8d def $r8
+; WIN64-NEXT: # kill: def $esi killed $esi def $rsi
; WIN64-NEXT: # kill: def $edi killed $edi def $rdi
+; WIN64-NEXT: # kill: def $edx killed $edx def $rdx
; WIN64-NEXT: leal (%rdx,%rdi), %ebx
; WIN64-NEXT: movl %edx, %ebp
; WIN64-NEXT: subl %edi, %ebp
@@ -288,14 +288,14 @@ define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a
;
; LINUXOSX-LABEL: testi32_inp:
; LINUXOSX: # %bb.0:
-; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx
-; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX-NEXT: # kill: def $r14d killed $r14d def $r14
; LINUXOSX-NEXT: # kill: def $r13d killed $r13d def $r13
; LINUXOSX-NEXT: # kill: def $r12d killed $r12d def $r12
; LINUXOSX-NEXT: # kill: def $r9d killed $r9d def $r9
; LINUXOSX-NEXT: # kill: def $r8d killed $r8d def $r8
+; LINUXOSX-NEXT: # kill: def $esi killed $esi def $rsi
; LINUXOSX-NEXT: # kill: def $edi killed $edi def $rdi
+; LINUXOSX-NEXT: # kill: def $edx killed $edx def $rdx
; LINUXOSX-NEXT: leal (%rdx,%rdi), %r10d
; LINUXOSX-NEXT: movl %edx, %r11d
; LINUXOSX-NEXT: subl %edi, %r11d
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index f1fd05565c47e9..20d3f20c1a1492 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -389,7 +389,7 @@ define void @vec128_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec128_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
@@ -452,7 +452,7 @@ define void @vec128_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec128_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
@@ -599,7 +599,7 @@ define void @vec128_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec128_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
@@ -694,7 +694,7 @@ define void @vec128_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; AVX512-LABEL: vec128_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
@@ -1003,7 +1003,7 @@ define void @vec256_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec256_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
@@ -1079,7 +1079,7 @@ define void @vec256_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec256_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
@@ -1355,7 +1355,7 @@ define void @vec256_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec256_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
@@ -1550,7 +1550,7 @@ define void @vec256_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; AVX512-LABEL: vec256_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
@@ -2170,7 +2170,7 @@ define void @vec384_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec384_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
@@ -2258,7 +2258,7 @@ define void @vec384_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec384_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
@@ -2722,7 +2722,7 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; AVX512-LABEL: vec384_v3i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vpextrb $2, %xmm0, 2(%rsi)
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: movw %ax, (%rsi)
@@ -3006,7 +3006,7 @@ define void @vec384_v3i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec384_v3i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi)
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx)
@@ -3664,7 +3664,7 @@ define void @vec384_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec384_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
@@ -3983,7 +3983,7 @@ define void @vec384_v6i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; AVX512-LABEL: vec384_v6i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rsi)
; AVX512-NEXT: vmovd %xmm0, (%rsi)
; AVX512-NEXT: vpextrw $2, %xmm0, 4(%rdx)
@@ -4420,7 +4420,7 @@ define void @vec384_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; AVX512-LABEL: vec384_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
@@ -5444,7 +5444,7 @@ define void @vec512_v2i32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec512_v2i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -5540,7 +5540,7 @@ define void @vec512_v2f32(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec512_v2f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -5965,7 +5965,7 @@ define void @vec512_v4i16(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; AVX512-LABEL: vec512_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -6363,7 +6363,7 @@ define void @vec512_v8i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; AVX512-LABEL: vec512_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = ~xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vpbroadcastq %xmm0, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -7079,6 +7079,14 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-NEXT: pushq %r13
; SCALAR-NEXT: pushq %r12
; SCALAR-NEXT: pushq %rbx
+; SCALAR-NEXT: movzbl 20(%rdi), %eax
+; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 19(%rdi), %eax
+; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 18(%rdi), %eax
+; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 17(%rdi), %eax
+; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 16(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 15(%rdi), %eax
@@ -7087,9 +7095,9 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 13(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 12(%rdi), %r13d
-; SCALAR-NEXT: movzbl 11(%rdi), %eax
+; SCALAR-NEXT: movzbl 12(%rdi), %eax
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 11(%rdi), %r13d
; SCALAR-NEXT: movzbl 10(%rdi), %r12d
; SCALAR-NEXT: movzbl 9(%rdi), %r15d
; SCALAR-NEXT: movzbl 8(%rdi), %r14d
@@ -7123,55 +7131,51 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r12b
; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
-; SCALAR-NEXT: notb %r11b
-; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb %r13b
; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
+; SCALAR-NEXT: notb %r12b
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
+; SCALAR-NEXT: notb %r10b
+; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
+; SCALAR-NEXT: notb %r9b
+; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
; SCALAR-NEXT: notb %r8b
-; SCALAR-NEXT: notb {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Folded Spill
-; SCALAR-NEXT: movzbl 17(%rdi), %eax
-; SCALAR-NEXT: notb %al
-; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 18(%rdi), %eax
-; SCALAR-NEXT: notb %al
-; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 19(%rdi), %eax
+; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 21(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 20(%rdi), %eax
+; SCALAR-NEXT: movzbl 22(%rdi), %ebx
+; SCALAR-NEXT: notb %bl
+; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 23(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 21(%rdi), %ebp
+; SCALAR-NEXT: movzbl 24(%rdi), %ebp
; SCALAR-NEXT: notb %bpl
; SCALAR-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 22(%rdi), %ebx
-; SCALAR-NEXT: notb %bl
-; SCALAR-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 23(%rdi), %r10d
-; SCALAR-NEXT: notb %r10b
-; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 24(%rdi), %r9d
-; SCALAR-NEXT: notb %r9b
-; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 25(%rdi), %ecx
-; SCALAR-NEXT: notb %cl
-; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 25(%rdi), %r11d
+; SCALAR-NEXT: notb %r11b
+; SCALAR-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 26(%rdi), %r14d
; SCALAR-NEXT: notb %r14b
; SCALAR-NEXT: movb %r14b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 27(%rdi), %r15d
; SCALAR-NEXT: notb %r15b
; SCALAR-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 28(%rdi), %r12d
-; SCALAR-NEXT: notb %r12b
-; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SCALAR-NEXT: movzbl 29(%rdi), %r13d
+; SCALAR-NEXT: movzbl 28(%rdi), %r13d
; SCALAR-NEXT: notb %r13b
; SCALAR-NEXT: movb %r13b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movzbl 29(%rdi), %ecx
+; SCALAR-NEXT: notb %cl
+; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movzbl 30(%rdi), %eax
; SCALAR-NEXT: notb %al
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
@@ -7180,57 +7184,56 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SCALAR-NEXT: movb %dil, 31(%rsi)
; SCALAR-NEXT: movb %al, 30(%rsi)
-; SCALAR-NEXT: movb %r13b, 29(%rsi)
-; SCALAR-NEXT: movb %r12b, 28(%rsi)
+; SCALAR-NEXT: movb %cl, 29(%rsi)
+; SCALAR-NEXT: movb %r13b, 28(%rsi)
; SCALAR-NEXT: movb %r15b, 27(%rsi)
; SCALAR-NEXT: movb %r14b, 26(%rsi)
-; SCALAR-NEXT: movb %cl, 25(%rsi)
-; SCALAR-NEXT: movb %r9b, 24(%rsi)
-; SCALAR-NEXT: movb %r10b, 23(%rsi)
-; SCALAR-NEXT: movb %bl, 22(%rsi)
-; SCALAR-NEXT: movb %bpl, 21(%rsi)
+; SCALAR-NEXT: movb %r11b, 25(%rsi)
+; SCALAR-NEXT: movb %bpl, 24(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
-; SCALAR-NEXT: movb %bpl, 20(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 19(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 18(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 17(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SCALAR-NEXT: movb %cl, 16(%rsi)
-; SCALAR-NEXT: movb %r8b, 15(%rsi)
-; SCALAR-NEXT: movl %r8d, %r14d
-; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SCALAR-NEXT: movb %bpl, 23(%rsi)
+; SCALAR-NEXT: movb %bl, 22(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
-; SCALAR-NEXT: movb %bl, 14(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 13(%rsi)
+; SCALAR-NEXT: movb %bl, 21(%rsi)
+; SCALAR-NEXT: movb %r8b, 20(%rsi)
+; SCALAR-NEXT: movb %r9b, 19(%rsi)
+; SCALAR-NEXT: movb %r10b, 18(%rsi)
+; SCALAR-NEXT: movb %r12b, 17(%rsi)
+; SCALAR-NEXT: movl %r12d, %r15d
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r12b, 16(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r14b, 15(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 12(%rsi)
-; SCALAR-NEXT: movb %r11b, 11(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %dil, 10(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %dil, 9(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %dil, 8(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r11b, 7(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r13b, 6(%rsi)
+; SCALAR-NEXT: movb %al, 14(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 13(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 12(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 11(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 10(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r10b, 5(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r12b, 4(%rsi)
+; SCALAR-NEXT: movb %r10b, 9(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 8(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r9b, 3(%rsi)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r15b, 2(%rsi)
+; SCALAR-NEXT: movb %r9b, 7(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r13b, 6(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r8b, 1(%rsi)
+; SCALAR-NEXT: movb %r8b, 5(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 4(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; SCALAR-NEXT: movb %dil, 3(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; SCALAR-NEXT: movb %dil, 2(%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %dil, (%rsi)
+; SCALAR-NEXT: movb %dil, 1(%rsi)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r11b, (%rsi)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 31(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
@@ -7247,92 +7250,92 @@ define void @vec512_v32i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.
; SCALAR-NEXT: movb %sil, 25(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 24(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 23(%rdx)
+; SCALAR-NEXT: movb %bpl, 23(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
; SCALAR-NEXT: movb %sil, 22(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 21(%rdx)
-; SCALAR-NEXT: movb %bpl, 20(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 19(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 18(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 17(%rdx)
-; SCALAR-NEXT: movb %cl, 16(%rdx)
+; SCALAR-NEXT: movb %bl, 21(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %bl, 20(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %bl, 19(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %bl, 18(%rdx)
+; SCALAR-NEXT: movb %r15b, 17(%rdx)
+; SCALAR-NEXT: movb %r12b, 16(%rdx)
; SCALAR-NEXT: movb %r14b, 15(%rdx)
-; SCALAR-NEXT: movb %bl, 14(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SCALAR-NEXT: movb %cl, 13(%rdx)
+; SCALAR-NEXT: movb %al, 14(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
+; SCALAR-NEXT: movb %bpl, 13(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SCALAR-NEXT: movb %al, 12(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SCALAR-NEXT: movb %sil, 11(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r12b, 11(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload
; SCALAR-NEXT: movb %bl, 10(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r14d # 1-byte Folded Reload
-; SCALAR-NEXT: movb %r14b, 9(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebp # 1-byte Folded Reload
-; SCALAR-NEXT: movb %bpl, 8(%rdx)
-; SCALAR-NEXT: movb %r11b, 7(%rdx)
+; SCALAR-NEXT: movb %r10b, 9(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SCALAR-NEXT: movb %al, 8(%rdx)
+; SCALAR-NEXT: movb %r9b, 7(%rdx)
; SCALAR-NEXT: movb %r13b, 6(%rdx)
-; SCALAR-NEXT: movb %r10b, 5(%rdx)
-; SCALAR-NEXT: movb %r12b, 4(%rdx)
+; SCALAR-NEXT: movb %r8b, 5(%rdx)
+; SCALAR-NEXT: movb %cl, 4(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload
; SCALAR-NEXT: movb %r9b, 3(%rdx)
-; SCALAR-NEXT: movb %r15b, 2(%rdx)
-; SCALAR-NEXT: movb %r8b, 1(%rdx)
-; SCALAR-NEXT: movb %dil, (%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 63(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 62(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 61(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 60(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 59(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 58(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 57(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 56(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 55(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 54(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 53(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 52(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 51(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 50(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 49(%rdx)
-; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 48(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SCALAR-NEXT: movb %r8b, 2(%rdx)
+; SCALAR-NEXT: movb %dil, 1(%rdx)
+; SCALAR-NEXT: movb %r11b, (%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 63(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 62(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 61(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 60(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 59(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 58(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 57(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 56(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 55(%rdx)
+; SCALAR-NEXT: movb %sil, 54(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 53(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 52(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 51(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 50(%rdx)
+; SCALAR-NEXT: movb %r15b, 49(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 48(%rdx)
+; SCALAR-NEXT: movb %r14b, 47(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 46(%rdx)
+; SCALAR-NEXT: movb %bpl, 45(%rdx)
+; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SCALAR-NEXT: movb %cl, 44(%rdx)
+; SCALAR-NEXT: movb %r12b, 43(%rdx)
+; SCALAR-NEXT: movb %bl, 42(%rdx)
+; SCALAR-NEXT: movb %r10b, 41(%rdx)
+; SCALAR-NEXT: movb %al, 40(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 47(%rdx)
+; SCALAR-NEXT: movb %al, 39(%rdx)
+; SCALAR-NEXT: movb %r13b, 38(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 46(%rdx)
-; SCALAR-NEXT: movb %cl, 45(%rdx)
+; SCALAR-NEXT: movb %al, 37(%rdx)
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SCALAR-NEXT: movb %al, 44(%rdx)
-; SCALAR-NEXT: movb %sil, 43(%rdx)
-; SCALAR-NEXT: movb %bl, 42(%rdx)
-; SCALAR-NEXT: movb %r14b, 41(%rdx)
-; SCALAR-NEXT: movb %bpl, 40(%rdx)
-; SCALAR-NEXT: movb %r11b, 39(%rdx)
-; SCALAR-NEXT: movb %r13b, 38(%rdx)
-; SCALAR-NEXT: movb %r10b, 37(%rdx)
-; SCALAR-NEXT: movb %r12b, 36(%rdx)
+; SCALAR-NEXT: movb %al, 36(%rdx)
; SCALAR-NEXT: movb %r9b, 35(%rdx)
-; SCALAR-NEXT: movb %r15b, 34(%rdx)
-; SCALAR-NEXT: movb %r8b, 33(%rdx)
-; SCALAR-NEXT: movb %dil, 32(%rdx)
+; SCALAR-NEXT: movb %r8b, 34(%rdx)
+; SCALAR-NEXT: movb %dil, 33(%rdx)
+; SCALAR-NEXT: movb %r11b, 32(%rdx)
; SCALAR-NEXT: popq %rbx
; SCALAR-NEXT: popq %r12
; SCALAR-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index b1194bedc4e1ca..c8aa7cf8c8f29e 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -1198,6 +1198,14 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: movq %rdx, %r8
; CHECK-BASELINE-NEXT: movq %rsi, %r9
; CHECK-BASELINE-NEXT: movq %rdi, %r11
+; CHECK-BASELINE-NEXT: movzbl 19(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 18(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 17(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 16(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax
@@ -1306,26 +1314,26 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: andb 15(%r10), %al
; CHECK-BASELINE-NEXT: xorb %cl, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 16(%r8), %eax
-; CHECK-BASELINE-NEXT: movzbl 16(%r9), %ecx
-; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 16(%r10), %cl
-; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 17(%r8), %eax
+; CHECK-BASELINE-NEXT: movzbl 16(%r9), %eax
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-BASELINE-NEXT: xorb %cl, %al
+; CHECK-BASELINE-NEXT: andb 16(%r10), %al
+; CHECK-BASELINE-NEXT: xorb %cl, %al
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 17(%r9), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: andb 17(%r10), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 18(%r8), %eax
; CHECK-BASELINE-NEXT: movzbl 18(%r9), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: andb 18(%r10), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 19(%r8), %eax
; CHECK-BASELINE-NEXT: movzbl 19(%r9), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: andb 19(%r10), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
@@ -1465,6 +1473,14 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: movq %rdx, %r8
; CHECK-SSE1-NEXT: movq %rsi, %r9
; CHECK-SSE1-NEXT: movq %rdi, %r11
+; CHECK-SSE1-NEXT: movzbl 19(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 18(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 17(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 16(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax
@@ -1573,26 +1589,26 @@ define <32 x i8> @out_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: andb 15(%r10), %al
; CHECK-SSE1-NEXT: xorb %cl, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 16(%r8), %eax
-; CHECK-SSE1-NEXT: movzbl 16(%r9), %ecx
-; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 16(%r10), %cl
-; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 17(%r8), %eax
+; CHECK-SSE1-NEXT: movzbl 16(%r9), %eax
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; CHECK-SSE1-NEXT: xorb %cl, %al
+; CHECK-SSE1-NEXT: andb 16(%r10), %al
+; CHECK-SSE1-NEXT: xorb %cl, %al
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 17(%r9), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: andb 17(%r10), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 18(%r8), %eax
; CHECK-SSE1-NEXT: movzbl 18(%r9), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: andb 18(%r10), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 19(%r8), %eax
; CHECK-SSE1-NEXT: movzbl 19(%r9), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: andb 19(%r10), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
@@ -3231,10 +3247,18 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: pushq %r13
; CHECK-BASELINE-NEXT: pushq %r12
; CHECK-BASELINE-NEXT: pushq %rbx
-; CHECK-BASELINE-NEXT: movq %rcx, %r12
-; CHECK-BASELINE-NEXT: movq %rdx, %r15
+; CHECK-BASELINE-NEXT: movq %rcx, %r15
+; CHECK-BASELINE-NEXT: movq %rdx, %rbx
; CHECK-BASELINE-NEXT: movq %rsi, %r14
-; CHECK-BASELINE-NEXT: movq %rdi, %r13
+; CHECK-BASELINE-NEXT: movq %rdi, %r12
+; CHECK-BASELINE-NEXT: movzbl 19(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 18(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 17(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 16(%rdx), %eax
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 15(%rdx), %eax
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 14(%rdx), %eax
@@ -3247,241 +3271,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 10(%rdx), %eax
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r8d
-; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r9d
-; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r10d
-; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %ebp
-; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %edi
+; CHECK-BASELINE-NEXT: movzbl 9(%rdx), %r13d
+; CHECK-BASELINE-NEXT: movzbl 8(%rdx), %r10d
+; CHECK-BASELINE-NEXT: movzbl 7(%rdx), %r9d
+; CHECK-BASELINE-NEXT: movzbl 6(%rdx), %r8d
+; CHECK-BASELINE-NEXT: movzbl 5(%rdx), %ebp
; CHECK-BASELINE-NEXT: movzbl 4(%rdx), %esi
-; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %eax
-; CHECK-BASELINE-NEXT: movzbl 2(%rdx), %ecx
-; CHECK-BASELINE-NEXT: movzbl (%rdx), %r11d
-; CHECK-BASELINE-NEXT: movzbl 1(%rdx), %edx
-; CHECK-BASELINE-NEXT: movzbl (%r14), %ebx
-; CHECK-BASELINE-NEXT: xorb %r11b, %bl
-; CHECK-BASELINE-NEXT: andb (%r12), %bl
-; CHECK-BASELINE-NEXT: xorb %r11b, %bl
-; CHECK-BASELINE-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 1(%r14), %r11d
-; CHECK-BASELINE-NEXT: xorb %dl, %r11b
-; CHECK-BASELINE-NEXT: andb 1(%r12), %r11b
-; CHECK-BASELINE-NEXT: xorb %dl, %r11b
+; CHECK-BASELINE-NEXT: movzbl 3(%rdx), %edx
+; CHECK-BASELINE-NEXT: movzbl 2(%rbx), %eax
+; CHECK-BASELINE-NEXT: movzbl (%rbx), %edi
+; CHECK-BASELINE-NEXT: movzbl 1(%rbx), %ecx
+; CHECK-BASELINE-NEXT: movzbl (%r14), %r11d
+; CHECK-BASELINE-NEXT: xorb %dil, %r11b
+; CHECK-BASELINE-NEXT: andb (%r15), %r11b
+; CHECK-BASELINE-NEXT: xorb %dil, %r11b
; CHECK-BASELINE-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 2(%r14), %edx
-; CHECK-BASELINE-NEXT: xorb %cl, %dl
-; CHECK-BASELINE-NEXT: andb 2(%r12), %dl
-; CHECK-BASELINE-NEXT: xorb %cl, %dl
-; CHECK-BASELINE-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 3(%r14), %ecx
+; CHECK-BASELINE-NEXT: movzbl 1(%r14), %edi
+; CHECK-BASELINE-NEXT: xorb %cl, %dil
+; CHECK-BASELINE-NEXT: andb 1(%r15), %dil
+; CHECK-BASELINE-NEXT: xorb %cl, %dil
+; CHECK-BASELINE-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 2(%r14), %ecx
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 3(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 2(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 3(%r14), %eax
+; CHECK-BASELINE-NEXT: xorb %dl, %al
+; CHECK-BASELINE-NEXT: andb 3(%r15), %al
+; CHECK-BASELINE-NEXT: xorb %dl, %al
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 4(%r14), %eax
; CHECK-BASELINE-NEXT: xorb %sil, %al
-; CHECK-BASELINE-NEXT: andb 4(%r12), %al
+; CHECK-BASELINE-NEXT: andb 4(%r15), %al
; CHECK-BASELINE-NEXT: xorb %sil, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 5(%r14), %eax
-; CHECK-BASELINE-NEXT: xorb %dil, %al
-; CHECK-BASELINE-NEXT: andb 5(%r12), %al
-; CHECK-BASELINE-NEXT: xorb %dil, %al
-; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 6(%r14), %eax
; CHECK-BASELINE-NEXT: xorb %bpl, %al
-; CHECK-BASELINE-NEXT: andb 6(%r12), %al
+; CHECK-BASELINE-NEXT: andb 5(%r15), %al
; CHECK-BASELINE-NEXT: xorb %bpl, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 7(%r14), %eax
-; CHECK-BASELINE-NEXT: xorb %r10b, %al
-; CHECK-BASELINE-NEXT: andb 7(%r12), %al
-; CHECK-BASELINE-NEXT: xorb %r10b, %al
+; CHECK-BASELINE-NEXT: movzbl 6(%r14), %eax
+; CHECK-BASELINE-NEXT: xorb %r8b, %al
+; CHECK-BASELINE-NEXT: andb 6(%r15), %al
+; CHECK-BASELINE-NEXT: xorb %r8b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax
+; CHECK-BASELINE-NEXT: movzbl 7(%r14), %eax
; CHECK-BASELINE-NEXT: xorb %r9b, %al
-; CHECK-BASELINE-NEXT: andb 8(%r12), %al
+; CHECK-BASELINE-NEXT: andb 7(%r15), %al
; CHECK-BASELINE-NEXT: xorb %r9b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-BASELINE-NEXT: movzbl 8(%r14), %eax
+; CHECK-BASELINE-NEXT: xorb %r10b, %al
+; CHECK-BASELINE-NEXT: andb 8(%r15), %al
+; CHECK-BASELINE-NEXT: xorb %r10b, %al
+; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 9(%r14), %eax
-; CHECK-BASELINE-NEXT: xorb %r8b, %al
-; CHECK-BASELINE-NEXT: andb 9(%r12), %al
-; CHECK-BASELINE-NEXT: xorb %r8b, %al
+; CHECK-BASELINE-NEXT: xorb %r13b, %al
+; CHECK-BASELINE-NEXT: andb 9(%r15), %al
+; CHECK-BASELINE-NEXT: xorb %r13b, %al
; CHECK-BASELINE-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 10(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 10(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 10(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 11(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 11(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 11(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 12(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 12(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 12(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 13(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 13(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 13(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 14(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 14(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 14(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-BASELINE-NEXT: movzbl 15(%r14), %ecx
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 15(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 15(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 16(%r15), %eax
; CHECK-BASELINE-NEXT: movzbl 16(%r14), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 16(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 16(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 17(%r15), %eax
; CHECK-BASELINE-NEXT: movzbl 17(%r14), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 17(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 17(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 18(%r15), %eax
; CHECK-BASELINE-NEXT: movzbl 18(%r14), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 18(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 18(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 19(%r15), %eax
; CHECK-BASELINE-NEXT: movzbl 19(%r14), %ecx
+; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 19(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 19(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 20(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 20(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 20(%r14), %ecx
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 20(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 20(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
; CHECK-BASELINE-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-BASELINE-NEXT: movzbl 21(%r15), %eax
-; CHECK-BASELINE-NEXT: movzbl 21(%r14), %ebp
+; CHECK-BASELINE-NEXT: movzbl 21(%rbx), %eax
+; CHECK-BASELINE-NEXT: movzbl 21(%r14), %r13d
+; CHECK-BASELINE-NEXT: xorb %al, %r13b
+; CHECK-BASELINE-NEXT: andb 21(%r15), %r13b
+; CHECK-BASELINE-NEXT: xorb %al, %r13b
+; CHECK-BASELINE-NEXT: movzbl 22(%rbx), %eax
+; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebp
; CHECK-BASELINE-NEXT: xorb %al, %bpl
-; CHECK-BASELINE-NEXT: andb 21(%r12), %bpl
+; CHECK-BASELINE-NEXT: andb 22(%r15), %bpl
; CHECK-BASELINE-NEXT: xorb %al, %bpl
-; CHECK-BASELINE-NEXT: movzbl 22(%r15), %eax
-; CHECK-BASELINE-NEXT: movzbl 22(%r14), %ebx
-; CHECK-BASELINE-NEXT: xorb %al, %bl
-; CHECK-BASELINE-NEXT: andb 22(%r12), %bl
-; CHECK-BASELINE-NEXT: xorb %al, %bl
-; CHECK-BASELINE-NEXT: movzbl 23(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 23(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 23(%r14), %r11d
; CHECK-BASELINE-NEXT: xorb %al, %r11b
-; CHECK-BASELINE-NEXT: andb 23(%r12), %r11b
+; CHECK-BASELINE-NEXT: andb 23(%r15), %r11b
; CHECK-BASELINE-NEXT: xorb %al, %r11b
-; CHECK-BASELINE-NEXT: movzbl 24(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 24(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 24(%r14), %r9d
; CHECK-BASELINE-NEXT: xorb %al, %r9b
-; CHECK-BASELINE-NEXT: andb 24(%r12), %r9b
+; CHECK-BASELINE-NEXT: andb 24(%r15), %r9b
; CHECK-BASELINE-NEXT: xorb %al, %r9b
-; CHECK-BASELINE-NEXT: movzbl 25(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 25(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 25(%r14), %r8d
; CHECK-BASELINE-NEXT: xorb %al, %r8b
-; CHECK-BASELINE-NEXT: andb 25(%r12), %r8b
+; CHECK-BASELINE-NEXT: andb 25(%r15), %r8b
; CHECK-BASELINE-NEXT: xorb %al, %r8b
-; CHECK-BASELINE-NEXT: movzbl 26(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 26(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 26(%r14), %edi
; CHECK-BASELINE-NEXT: xorb %al, %dil
-; CHECK-BASELINE-NEXT: andb 26(%r12), %dil
+; CHECK-BASELINE-NEXT: andb 26(%r15), %dil
; CHECK-BASELINE-NEXT: xorb %al, %dil
-; CHECK-BASELINE-NEXT: movzbl 27(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 27(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 27(%r14), %esi
; CHECK-BASELINE-NEXT: xorb %al, %sil
-; CHECK-BASELINE-NEXT: andb 27(%r12), %sil
+; CHECK-BASELINE-NEXT: andb 27(%r15), %sil
; CHECK-BASELINE-NEXT: xorb %al, %sil
-; CHECK-BASELINE-NEXT: movzbl 28(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 28(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 28(%r14), %edx
; CHECK-BASELINE-NEXT: xorb %al, %dl
-; CHECK-BASELINE-NEXT: andb 28(%r12), %dl
+; CHECK-BASELINE-NEXT: andb 28(%r15), %dl
; CHECK-BASELINE-NEXT: xorb %al, %dl
-; CHECK-BASELINE-NEXT: movzbl 29(%r15), %eax
+; CHECK-BASELINE-NEXT: movzbl 29(%rbx), %eax
; CHECK-BASELINE-NEXT: movzbl 29(%r14), %ecx
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: andb 29(%r12), %cl
+; CHECK-BASELINE-NEXT: andb 29(%r15), %cl
; CHECK-BASELINE-NEXT: xorb %al, %cl
-; CHECK-BASELINE-NEXT: movzbl 30(%r15), %r10d
+; CHECK-BASELINE-NEXT: movzbl 30(%rbx), %r10d
; CHECK-BASELINE-NEXT: movzbl 30(%r14), %eax
; CHECK-BASELINE-NEXT: xorb %r10b, %al
-; CHECK-BASELINE-NEXT: andb 30(%r12), %al
+; CHECK-BASELINE-NEXT: andb 30(%r15), %al
; CHECK-BASELINE-NEXT: xorb %r10b, %al
-; CHECK-BASELINE-NEXT: movzbl 31(%r15), %r10d
-; CHECK-BASELINE-NEXT: movzbl 31(%r14), %r14d
-; CHECK-BASELINE-NEXT: xorb %r10b, %r14b
-; CHECK-BASELINE-NEXT: andb 31(%r12), %r14b
-; CHECK-BASELINE-NEXT: xorb %r10b, %r14b
-; CHECK-BASELINE-NEXT: movb %r14b, 31(%r13)
-; CHECK-BASELINE-NEXT: movb %al, 30(%r13)
-; CHECK-BASELINE-NEXT: movb %cl, 29(%r13)
-; CHECK-BASELINE-NEXT: movb %dl, 28(%r13)
-; CHECK-BASELINE-NEXT: movb %sil, 27(%r13)
-; CHECK-BASELINE-NEXT: movb %dil, 26(%r13)
-; CHECK-BASELINE-NEXT: movb %r8b, 25(%r13)
-; CHECK-BASELINE-NEXT: movb %r9b, 24(%r13)
-; CHECK-BASELINE-NEXT: movb %r11b, 23(%r13)
-; CHECK-BASELINE-NEXT: movb %bl, 22(%r13)
-; CHECK-BASELINE-NEXT: movb %bpl, 21(%r13)
+; CHECK-BASELINE-NEXT: movzbl 31(%rbx), %r10d
+; CHECK-BASELINE-NEXT: movzbl 31(%r14), %ebx
+; CHECK-BASELINE-NEXT: xorb %r10b, %bl
+; CHECK-BASELINE-NEXT: andb 31(%r15), %bl
+; CHECK-BASELINE-NEXT: xorb %r10b, %bl
+; CHECK-BASELINE-NEXT: movb %bl, 31(%r12)
+; CHECK-BASELINE-NEXT: movb %al, 30(%r12)
+; CHECK-BASELINE-NEXT: movb %cl, 29(%r12)
+; CHECK-BASELINE-NEXT: movb %dl, 28(%r12)
+; CHECK-BASELINE-NEXT: movb %sil, 27(%r12)
+; CHECK-BASELINE-NEXT: movb %dil, 26(%r12)
+; CHECK-BASELINE-NEXT: movb %r8b, 25(%r12)
+; CHECK-BASELINE-NEXT: movb %r9b, 24(%r12)
+; CHECK-BASELINE-NEXT: movb %r11b, 23(%r12)
+; CHECK-BASELINE-NEXT: movb %bpl, 22(%r12)
+; CHECK-BASELINE-NEXT: movb %r13b, 21(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 20(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 20(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 19(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 19(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 18(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 18(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 17(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 17(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 16(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 16(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 15(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 15(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 14(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 14(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 13(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 13(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 12(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 12(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 11(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 11(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 10(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 10(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 9(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 9(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 8(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 8(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 7(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 7(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 6(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 6(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 5(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 5(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 4(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 4(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 3(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 3(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 2(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 2(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, 1(%r13)
+; CHECK-BASELINE-NEXT: movb %al, 1(%r12)
; CHECK-BASELINE-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-BASELINE-NEXT: movb %al, (%r13)
-; CHECK-BASELINE-NEXT: movq %r13, %rax
+; CHECK-BASELINE-NEXT: movb %al, (%r12)
+; CHECK-BASELINE-NEXT: movq %r12, %rax
; CHECK-BASELINE-NEXT: popq %rbx
; CHECK-BASELINE-NEXT: popq %r12
; CHECK-BASELINE-NEXT: popq %r13
@@ -3498,10 +3522,18 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: pushq %r13
; CHECK-SSE1-NEXT: pushq %r12
; CHECK-SSE1-NEXT: pushq %rbx
-; CHECK-SSE1-NEXT: movq %rcx, %r12
-; CHECK-SSE1-NEXT: movq %rdx, %r15
+; CHECK-SSE1-NEXT: movq %rcx, %r15
+; CHECK-SSE1-NEXT: movq %rdx, %rbx
; CHECK-SSE1-NEXT: movq %rsi, %r14
-; CHECK-SSE1-NEXT: movq %rdi, %r13
+; CHECK-SSE1-NEXT: movq %rdi, %r12
+; CHECK-SSE1-NEXT: movzbl 19(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 18(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 17(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 16(%rdx), %eax
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 15(%rdx), %eax
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 14(%rdx), %eax
@@ -3514,241 +3546,241 @@ define <32 x i8> @in_v32i8(ptr%px, ptr%py, ptr%pmask) nounwind {
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 10(%rdx), %eax
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r8d
-; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r9d
-; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r10d
-; CHECK-SSE1-NEXT: movzbl 6(%rdx), %ebp
-; CHECK-SSE1-NEXT: movzbl 5(%rdx), %edi
+; CHECK-SSE1-NEXT: movzbl 9(%rdx), %r13d
+; CHECK-SSE1-NEXT: movzbl 8(%rdx), %r10d
+; CHECK-SSE1-NEXT: movzbl 7(%rdx), %r9d
+; CHECK-SSE1-NEXT: movzbl 6(%rdx), %r8d
+; CHECK-SSE1-NEXT: movzbl 5(%rdx), %ebp
; CHECK-SSE1-NEXT: movzbl 4(%rdx), %esi
-; CHECK-SSE1-NEXT: movzbl 3(%rdx), %eax
-; CHECK-SSE1-NEXT: movzbl 2(%rdx), %ecx
-; CHECK-SSE1-NEXT: movzbl (%rdx), %r11d
-; CHECK-SSE1-NEXT: movzbl 1(%rdx), %edx
-; CHECK-SSE1-NEXT: movzbl (%r14), %ebx
-; CHECK-SSE1-NEXT: xorb %r11b, %bl
-; CHECK-SSE1-NEXT: andb (%r12), %bl
-; CHECK-SSE1-NEXT: xorb %r11b, %bl
-; CHECK-SSE1-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 1(%r14), %r11d
-; CHECK-SSE1-NEXT: xorb %dl, %r11b
-; CHECK-SSE1-NEXT: andb 1(%r12), %r11b
-; CHECK-SSE1-NEXT: xorb %dl, %r11b
+; CHECK-SSE1-NEXT: movzbl 3(%rdx), %edx
+; CHECK-SSE1-NEXT: movzbl 2(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl (%rbx), %edi
+; CHECK-SSE1-NEXT: movzbl 1(%rbx), %ecx
+; CHECK-SSE1-NEXT: movzbl (%r14), %r11d
+; CHECK-SSE1-NEXT: xorb %dil, %r11b
+; CHECK-SSE1-NEXT: andb (%r15), %r11b
+; CHECK-SSE1-NEXT: xorb %dil, %r11b
; CHECK-SSE1-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 2(%r14), %edx
-; CHECK-SSE1-NEXT: xorb %cl, %dl
-; CHECK-SSE1-NEXT: andb 2(%r12), %dl
-; CHECK-SSE1-NEXT: xorb %cl, %dl
-; CHECK-SSE1-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 3(%r14), %ecx
+; CHECK-SSE1-NEXT: movzbl 1(%r14), %edi
+; CHECK-SSE1-NEXT: xorb %cl, %dil
+; CHECK-SSE1-NEXT: andb 1(%r15), %dil
+; CHECK-SSE1-NEXT: xorb %cl, %dil
+; CHECK-SSE1-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 2(%r14), %ecx
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 3(%r12), %cl
+; CHECK-SSE1-NEXT: andb 2(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 3(%r14), %eax
+; CHECK-SSE1-NEXT: xorb %dl, %al
+; CHECK-SSE1-NEXT: andb 3(%r15), %al
+; CHECK-SSE1-NEXT: xorb %dl, %al
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 4(%r14), %eax
; CHECK-SSE1-NEXT: xorb %sil, %al
-; CHECK-SSE1-NEXT: andb 4(%r12), %al
+; CHECK-SSE1-NEXT: andb 4(%r15), %al
; CHECK-SSE1-NEXT: xorb %sil, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 5(%r14), %eax
-; CHECK-SSE1-NEXT: xorb %dil, %al
-; CHECK-SSE1-NEXT: andb 5(%r12), %al
-; CHECK-SSE1-NEXT: xorb %dil, %al
-; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 6(%r14), %eax
; CHECK-SSE1-NEXT: xorb %bpl, %al
-; CHECK-SSE1-NEXT: andb 6(%r12), %al
+; CHECK-SSE1-NEXT: andb 5(%r15), %al
; CHECK-SSE1-NEXT: xorb %bpl, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 7(%r14), %eax
-; CHECK-SSE1-NEXT: xorb %r10b, %al
-; CHECK-SSE1-NEXT: andb 7(%r12), %al
-; CHECK-SSE1-NEXT: xorb %r10b, %al
+; CHECK-SSE1-NEXT: movzbl 6(%r14), %eax
+; CHECK-SSE1-NEXT: xorb %r8b, %al
+; CHECK-SSE1-NEXT: andb 6(%r15), %al
+; CHECK-SSE1-NEXT: xorb %r8b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax
+; CHECK-SSE1-NEXT: movzbl 7(%r14), %eax
; CHECK-SSE1-NEXT: xorb %r9b, %al
-; CHECK-SSE1-NEXT: andb 8(%r12), %al
+; CHECK-SSE1-NEXT: andb 7(%r15), %al
; CHECK-SSE1-NEXT: xorb %r9b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; CHECK-SSE1-NEXT: movzbl 8(%r14), %eax
+; CHECK-SSE1-NEXT: xorb %r10b, %al
+; CHECK-SSE1-NEXT: andb 8(%r15), %al
+; CHECK-SSE1-NEXT: xorb %r10b, %al
+; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 9(%r14), %eax
-; CHECK-SSE1-NEXT: xorb %r8b, %al
-; CHECK-SSE1-NEXT: andb 9(%r12), %al
-; CHECK-SSE1-NEXT: xorb %r8b, %al
+; CHECK-SSE1-NEXT: xorb %r13b, %al
+; CHECK-SSE1-NEXT: andb 9(%r15), %al
+; CHECK-SSE1-NEXT: xorb %r13b, %al
; CHECK-SSE1-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 10(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 10(%r12), %cl
+; CHECK-SSE1-NEXT: andb 10(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 11(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 11(%r12), %cl
+; CHECK-SSE1-NEXT: andb 11(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 12(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 12(%r12), %cl
+; CHECK-SSE1-NEXT: andb 12(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 13(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 13(%r12), %cl
+; CHECK-SSE1-NEXT: andb 13(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 14(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 14(%r12), %cl
+; CHECK-SSE1-NEXT: andb 14(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; CHECK-SSE1-NEXT: movzbl 15(%r14), %ecx
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 15(%r12), %cl
+; CHECK-SSE1-NEXT: andb 15(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 16(%r15), %eax
; CHECK-SSE1-NEXT: movzbl 16(%r14), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 16(%r12), %cl
+; CHECK-SSE1-NEXT: andb 16(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 17(%r15), %eax
; CHECK-SSE1-NEXT: movzbl 17(%r14), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 17(%r12), %cl
+; CHECK-SSE1-NEXT: andb 17(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 18(%r15), %eax
; CHECK-SSE1-NEXT: movzbl 18(%r14), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 18(%r12), %cl
+; CHECK-SSE1-NEXT: andb 18(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 19(%r15), %eax
; CHECK-SSE1-NEXT: movzbl 19(%r14), %ecx
+; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 19(%r12), %cl
+; CHECK-SSE1-NEXT: andb 19(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 20(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 20(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 20(%r14), %ecx
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 20(%r12), %cl
+; CHECK-SSE1-NEXT: andb 20(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
; CHECK-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; CHECK-SSE1-NEXT: movzbl 21(%r15), %eax
-; CHECK-SSE1-NEXT: movzbl 21(%r14), %ebp
+; CHECK-SSE1-NEXT: movzbl 21(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 21(%r14), %r13d
+; CHECK-SSE1-NEXT: xorb %al, %r13b
+; CHECK-SSE1-NEXT: andb 21(%r15), %r13b
+; CHECK-SSE1-NEXT: xorb %al, %r13b
+; CHECK-SSE1-NEXT: movzbl 22(%rbx), %eax
+; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebp
; CHECK-SSE1-NEXT: xorb %al, %bpl
-; CHECK-SSE1-NEXT: andb 21(%r12), %bpl
+; CHECK-SSE1-NEXT: andb 22(%r15), %bpl
; CHECK-SSE1-NEXT: xorb %al, %bpl
-; CHECK-SSE1-NEXT: movzbl 22(%r15), %eax
-; CHECK-SSE1-NEXT: movzbl 22(%r14), %ebx
-; CHECK-SSE1-NEXT: xorb %al, %bl
-; CHECK-SSE1-NEXT: andb 22(%r12), %bl
-; CHECK-SSE1-NEXT: xorb %al, %bl
-; CHECK-SSE1-NEXT: movzbl 23(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 23(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 23(%r14), %r11d
; CHECK-SSE1-NEXT: xorb %al, %r11b
-; CHECK-SSE1-NEXT: andb 23(%r12), %r11b
+; CHECK-SSE1-NEXT: andb 23(%r15), %r11b
; CHECK-SSE1-NEXT: xorb %al, %r11b
-; CHECK-SSE1-NEXT: movzbl 24(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 24(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 24(%r14), %r9d
; CHECK-SSE1-NEXT: xorb %al, %r9b
-; CHECK-SSE1-NEXT: andb 24(%r12), %r9b
+; CHECK-SSE1-NEXT: andb 24(%r15), %r9b
; CHECK-SSE1-NEXT: xorb %al, %r9b
-; CHECK-SSE1-NEXT: movzbl 25(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 25(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 25(%r14), %r8d
; CHECK-SSE1-NEXT: xorb %al, %r8b
-; CHECK-SSE1-NEXT: andb 25(%r12), %r8b
+; CHECK-SSE1-NEXT: andb 25(%r15), %r8b
; CHECK-SSE1-NEXT: xorb %al, %r8b
-; CHECK-SSE1-NEXT: movzbl 26(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 26(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 26(%r14), %edi
; CHECK-SSE1-NEXT: xorb %al, %dil
-; CHECK-SSE1-NEXT: andb 26(%r12), %dil
+; CHECK-SSE1-NEXT: andb 26(%r15), %dil
; CHECK-SSE1-NEXT: xorb %al, %dil
-; CHECK-SSE1-NEXT: movzbl 27(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 27(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 27(%r14), %esi
; CHECK-SSE1-NEXT: xorb %al, %sil
-; CHECK-SSE1-NEXT: andb 27(%r12), %sil
+; CHECK-SSE1-NEXT: andb 27(%r15), %sil
; CHECK-SSE1-NEXT: xorb %al, %sil
-; CHECK-SSE1-NEXT: movzbl 28(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 28(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 28(%r14), %edx
; CHECK-SSE1-NEXT: xorb %al, %dl
-; CHECK-SSE1-NEXT: andb 28(%r12), %dl
+; CHECK-SSE1-NEXT: andb 28(%r15), %dl
; CHECK-SSE1-NEXT: xorb %al, %dl
-; CHECK-SSE1-NEXT: movzbl 29(%r15), %eax
+; CHECK-SSE1-NEXT: movzbl 29(%rbx), %eax
; CHECK-SSE1-NEXT: movzbl 29(%r14), %ecx
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: andb 29(%r12), %cl
+; CHECK-SSE1-NEXT: andb 29(%r15), %cl
; CHECK-SSE1-NEXT: xorb %al, %cl
-; CHECK-SSE1-NEXT: movzbl 30(%r15), %r10d
+; CHECK-SSE1-NEXT: movzbl 30(%rbx), %r10d
; CHECK-SSE1-NEXT: movzbl 30(%r14), %eax
; CHECK-SSE1-NEXT: xorb %r10b, %al
-; CHECK-SSE1-NEXT: andb 30(%r12), %al
+; CHECK-SSE1-NEXT: andb 30(%r15), %al
; CHECK-SSE1-NEXT: xorb %r10b, %al
-; CHECK-SSE1-NEXT: movzbl 31(%r15), %r10d
-; CHECK-SSE1-NEXT: movzbl 31(%r14), %r14d
-; CHECK-SSE1-NEXT: xorb %r10b, %r14b
-; CHECK-SSE1-NEXT: andb 31(%r12), %r14b
-; CHECK-SSE1-NEXT: xorb %r10b, %r14b
-; CHECK-SSE1-NEXT: movb %r14b, 31(%r13)
-; CHECK-SSE1-NEXT: movb %al, 30(%r13)
-; CHECK-SSE1-NEXT: movb %cl, 29(%r13)
-; CHECK-SSE1-NEXT: movb %dl, 28(%r13)
-; CHECK-SSE1-NEXT: movb %sil, 27(%r13)
-; CHECK-SSE1-NEXT: movb %dil, 26(%r13)
-; CHECK-SSE1-NEXT: movb %r8b, 25(%r13)
-; CHECK-SSE1-NEXT: movb %r9b, 24(%r13)
-; CHECK-SSE1-NEXT: movb %r11b, 23(%r13)
-; CHECK-SSE1-NEXT: movb %bl, 22(%r13)
-; CHECK-SSE1-NEXT: movb %bpl, 21(%r13)
+; CHECK-SSE1-NEXT: movzbl 31(%rbx), %r10d
+; CHECK-SSE1-NEXT: movzbl 31(%r14), %ebx
+; CHECK-SSE1-NEXT: xorb %r10b, %bl
+; CHECK-SSE1-NEXT: andb 31(%r15), %bl
+; CHECK-SSE1-NEXT: xorb %r10b, %bl
+; CHECK-SSE1-NEXT: movb %bl, 31(%r12)
+; CHECK-SSE1-NEXT: movb %al, 30(%r12)
+; CHECK-SSE1-NEXT: movb %cl, 29(%r12)
+; CHECK-SSE1-NEXT: movb %dl, 28(%r12)
+; CHECK-SSE1-NEXT: movb %sil, 27(%r12)
+; CHECK-SSE1-NEXT: movb %dil, 26(%r12)
+; CHECK-SSE1-NEXT: movb %r8b, 25(%r12)
+; CHECK-SSE1-NEXT: movb %r9b, 24(%r12)
+; CHECK-SSE1-NEXT: movb %r11b, 23(%r12)
+; CHECK-SSE1-NEXT: movb %bpl, 22(%r12)
+; CHECK-SSE1-NEXT: movb %r13b, 21(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 20(%r13)
+; CHECK-SSE1-NEXT: movb %al, 20(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 19(%r13)
+; CHECK-SSE1-NEXT: movb %al, 19(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 18(%r13)
+; CHECK-SSE1-NEXT: movb %al, 18(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 17(%r13)
+; CHECK-SSE1-NEXT: movb %al, 17(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 16(%r13)
+; CHECK-SSE1-NEXT: movb %al, 16(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 15(%r13)
+; CHECK-SSE1-NEXT: movb %al, 15(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 14(%r13)
+; CHECK-SSE1-NEXT: movb %al, 14(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 13(%r13)
+; CHECK-SSE1-NEXT: movb %al, 13(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 12(%r13)
+; CHECK-SSE1-NEXT: movb %al, 12(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 11(%r13)
+; CHECK-SSE1-NEXT: movb %al, 11(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 10(%r13)
+; CHECK-SSE1-NEXT: movb %al, 10(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 9(%r13)
+; CHECK-SSE1-NEXT: movb %al, 9(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 8(%r13)
+; CHECK-SSE1-NEXT: movb %al, 8(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 7(%r13)
+; CHECK-SSE1-NEXT: movb %al, 7(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 6(%r13)
+; CHECK-SSE1-NEXT: movb %al, 6(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 5(%r13)
+; CHECK-SSE1-NEXT: movb %al, 5(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 4(%r13)
+; CHECK-SSE1-NEXT: movb %al, 4(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 3(%r13)
+; CHECK-SSE1-NEXT: movb %al, 3(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 2(%r13)
+; CHECK-SSE1-NEXT: movb %al, 2(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, 1(%r13)
+; CHECK-SSE1-NEXT: movb %al, 1(%r12)
; CHECK-SSE1-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; CHECK-SSE1-NEXT: movb %al, (%r13)
-; CHECK-SSE1-NEXT: movq %r13, %rax
+; CHECK-SSE1-NEXT: movb %al, (%r12)
+; CHECK-SSE1-NEXT: movq %r12, %rax
; CHECK-SSE1-NEXT: popq %rbx
; CHECK-SSE1-NEXT: popq %r12
; CHECK-SSE1-NEXT: popq %r13
diff --git a/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll b/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll
index e3bc77d4d5fa28..d9dc117397b4a1 100644
--- a/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/x86-64-flags-intrinsics.ll
@@ -91,11 +91,11 @@ define i64 @read_flags_reg_pressure() nounwind {
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rbp
; WIN64-NEXT: #APP
; WIN64-NEXT: #NO_APP
-; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-NEXT: pushfq
-; WIN64-NEXT: popq %rdx
-; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; WIN64-NEXT: popq %rcx
+; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; WIN64-NEXT: #APP
; WIN64-NEXT: #NO_APP
; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
@@ -168,11 +168,11 @@ define void @write_flags_reg_pressure(i64 noundef %0) nounwind {
; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; WIN64-NEXT: #APP
; WIN64-NEXT: #NO_APP
-; WIN64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; WIN64-NEXT: pushq %rdx
+; WIN64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; WIN64-NEXT: pushq %rcx
; WIN64-NEXT: popfq
-; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; WIN64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; WIN64-NEXT: #APP
; WIN64-NEXT: #NO_APP
; WIN64-NEXT: addq $16, %rsp
diff --git a/llvm/test/TableGen/bare-minimum-psets.td b/llvm/test/TableGen/bare-minimum-psets.td
index 25e0bd2a83d1d5..170838dd5f01c3 100644
--- a/llvm/test/TableGen/bare-minimum-psets.td
+++ b/llvm/test/TableGen/bare-minimum-psets.td
@@ -55,7 +55,7 @@ def MyTarget : Target;
// CHECK-NEXT: }
// CHECK: unsigned MyTargetGenRegisterInfo::
-// CHECK-NEXT: getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const {
+// CHECK-NEXT: getRawRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const {
// CHECK-NEXT: static const uint8_t PressureLimitTable[] = {
// CHECK-NEXT: {{[0-9]+}}, // 0: D_32
// CHECK-NEXT: };
diff --git a/llvm/test/TableGen/inhibit-pset.td b/llvm/test/TableGen/inhibit-pset.td
index 1f4f8a176c62c9..b3443f1938e8c4 100644
--- a/llvm/test/TableGen/inhibit-pset.td
+++ b/llvm/test/TableGen/inhibit-pset.td
@@ -15,7 +15,7 @@ def X0 : Register <"x0">;
// CHECK-NEXT: }
// CHECK: unsigned TestTargetGenRegisterInfo::
-// CHECK-NEXT: getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const {
+// CHECK-NEXT: getRawRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const {
// CHECK-NEXT: static const uint16_t PressureLimitTable[] = {
// CHECK-NEXT: {{[0-9]+}}, // 0: GPR32
// CHECK-NEXT: };
diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc
index 67759bd5c4632e..08bb859dee40c9 100644
--- a/llvm/unittests/CodeGen/MFCommon.inc
+++ b/llvm/unittests/CodeGen/MFCommon.inc
@@ -50,8 +50,8 @@ public:
const char *getRegPressureSetName(unsigned Idx) const override {
return "bogus";
}
- unsigned getRegPressureSetLimit(const MachineFunction &MF,
- unsigned Idx) const override {
+ unsigned getRawRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const override {
return 0;
}
const int *
diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
index a6f87119aca5ba..79c2eb2acc5646 100644
--- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp
@@ -275,8 +275,8 @@ void RegisterInfoEmitter::EmitRegUnitPressure(raw_ostream &OS,
OS << "// Get the register unit pressure limit for this dimension.\n"
<< "// This limit must be adjusted dynamically for reserved registers.\n"
<< "unsigned " << ClassName << "::\n"
- << "getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const "
- "{\n"
+ << "getRawRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) "
+ "const {\n"
<< " static const " << getMinimalTypeForRange(MaxRegUnitWeight, 32)
<< " PressureLimitTable[] = {\n";
for (unsigned i = 0; i < NumSets; ++i) {
@@ -1129,7 +1129,8 @@ void RegisterInfoEmitter::runTargetHeader(raw_ostream &OS) {
<< " unsigned getRegUnitWeight(unsigned RegUnit) const override;\n"
<< " unsigned getNumRegPressureSets() const override;\n"
<< " const char *getRegPressureSetName(unsigned Idx) const override;\n"
- << " unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned "
+ << " unsigned getRawRegPressureSetLimit(const MachineFunction &MF, "
+ "unsigned "
"Idx) const override;\n"
<< " const int *getRegClassPressureSets("
<< "const TargetRegisterClass *RC) const override;\n"
More information about the llvm-commits
mailing list