[llvm] [RegAlloc] Remove default restriction on non-trivial rematerialization (PR #159211)

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 1 05:48:45 PDT 2025


https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/159211

>From 6c471821f58a84e3fa5c578fd0a71e771af1b35d Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 17 Sep 2025 04:01:57 +0800
Subject: [PATCH 1/2] [RegAlloc] Allow rematerialization with virtual reg uses

Stacked on #159180.

Unless overridden by the target, we currently only allow rematerialization of instructions with immediate or constant physical register operands, i.e. no virtual registers.

The comment states that this is because we might increase a live range of the virtual register, but we don't actually do this. LiveRangeEdit::allUsesAvailableAt makes sure that we only rematerialize instructions whose virtual registers are already live at the use sites.

This patch relaxes this constraint which reduces a significant amount of reloads across various targets.

This is another attempt at https://reviews.llvm.org/D106408, but #159180 aims to have addressed the issue with the weights that may have caused the previous regressions.
---
 llvm/lib/CodeGen/TargetInstrInfo.cpp          |   6 -
 .../GlobalISel/split-wide-shifts-multiway.ll  | 812 +++++++++---------
 .../AArch64/aarch64-matrix-umull-smull.ll     |  12 +-
 .../CodeGen/AArch64/machine-combiner-copy.ll  |   2 +-
 .../CodeGen/AArch64/machine-licm-sub-loop.ll  |  47 +-
 llvm/test/CodeGen/AArch64/peephole-and-tst.ll |   4 +-
 .../AArch64/reserveXreg-for-regalloc.ll       |   6 +-
 llvm/test/CodeGen/AArch64/tbl-loops.ll        |   8 +-
 llvm/test/CodeGen/ARM/combine-movc-sub.ll     |  12 +-
 llvm/test/CodeGen/RISCV/pr69586.ll            | 204 ++---
 .../Thumb2/LowOverheadLoops/reductions.ll     |   6 +-
 .../Thumb2/LowOverheadLoops/spillingmove.ll   | 151 ++--
 .../varying-outer-2d-reduction.ll             |  80 +-
 .../LowOverheadLoops/vcmp-vpst-combination.ll |  13 +-
 .../CodeGen/Thumb2/mve-float16regloops.ll     |  82 +-
 .../CodeGen/Thumb2/mve-float32regloops.ll     | 100 ++-
 .../CodeGen/Thumb2/mve-gather-increment.ll    | 278 +++---
 llvm/test/CodeGen/Thumb2/mve-phireg.ll        |  30 +-
 llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll   | 519 +++++------
 llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll   |  22 +-
 .../CodeGen/Thumb2/mve-scatter-increment.ll   |  10 +-
 .../CodeGen/Thumb2/mve-vecreduce-addpred.ll   |  16 +-
 .../CodeGen/Thumb2/mve-vecreduce-mlapred.ll   |  92 +-
 .../X86/AMX/amx-greedy-ra-spill-shape.ll      |  82 +-
 .../CodeGen/X86/dag-update-nodetomatch.ll     |   5 +-
 .../X86/delete-dead-instrs-with-live-uses.mir |   4 +-
 llvm/test/CodeGen/X86/inalloca-invoke.ll      |   2 +-
 llvm/test/CodeGen/X86/licm-regpressure.ll     |  62 +-
 28 files changed, 1320 insertions(+), 1347 deletions(-)

diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 2f3b7a2c8fcdf..3c41bbeb4b327 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -1657,12 +1657,6 @@ bool TargetInstrInfo::isReMaterializableImpl(
     // same virtual register, though.
     if (MO.isDef() && Reg != DefReg)
       return false;
-
-    // Don't allow any virtual-register uses. Rematting an instruction with
-    // virtual register uses would length the live ranges of the uses, which
-    // is not necessarily a good idea, certainly not "trivial".
-    if (MO.isUse())
-      return false;
   }
 
   // Everything checked out.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
index ed68723e470a2..41f7ab89094ad 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
@@ -1219,14 +1219,14 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ;
 ; GISEL-LABEL: test_shl_i1024:
 ; GISEL:       ; %bb.0: ; %entry
-; GISEL-NEXT:    sub sp, sp, #416
-; GISEL-NEXT:    stp x28, x27, [sp, #320] ; 16-byte Folded Spill
-; GISEL-NEXT:    stp x26, x25, [sp, #336] ; 16-byte Folded Spill
-; GISEL-NEXT:    stp x24, x23, [sp, #352] ; 16-byte Folded Spill
-; GISEL-NEXT:    stp x22, x21, [sp, #368] ; 16-byte Folded Spill
-; GISEL-NEXT:    stp x20, x19, [sp, #384] ; 16-byte Folded Spill
-; GISEL-NEXT:    stp x29, x30, [sp, #400] ; 16-byte Folded Spill
-; GISEL-NEXT:    .cfi_def_cfa_offset 416
+; GISEL-NEXT:    sub sp, sp, #432
+; GISEL-NEXT:    stp x28, x27, [sp, #336] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x26, x25, [sp, #352] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x24, x23, [sp, #368] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x22, x21, [sp, #384] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x20, x19, [sp, #400] ; 16-byte Folded Spill
+; GISEL-NEXT:    stp x29, x30, [sp, #416] ; 16-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 432
 ; GISEL-NEXT:    .cfi_offset w30, -8
 ; GISEL-NEXT:    .cfi_offset w29, -16
 ; GISEL-NEXT:    .cfi_offset w19, -24
@@ -1242,38 +1242,44 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    ldp x10, x11, [x1]
 ; GISEL-NEXT:    mov w8, w2
 ; GISEL-NEXT:    lsr x9, x8, #6
-; GISEL-NEXT:    and x16, x8, #0x3f
+; GISEL-NEXT:    and x12, x8, #0x3f
+; GISEL-NEXT:    str x0, [sp, #144] ; 8-byte Folded Spill
+; GISEL-NEXT:    and x14, x8, #0x3f
 ; GISEL-NEXT:    mov w13, #64 ; =0x40
-; GISEL-NEXT:    sub x21, x13, x16
-; GISEL-NEXT:    str x0, [sp, #112] ; 8-byte Folded Spill
-; GISEL-NEXT:    mov x24, x16
-; GISEL-NEXT:    lsl x25, x10, x16
+; GISEL-NEXT:    and x16, x8, #0x3f
+; GISEL-NEXT:    lsl x0, x10, x12
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    lsr x26, x10, x21
-; GISEL-NEXT:    lsl x2, x11, x16
-; GISEL-NEXT:    lsr x23, x11, x21
-; GISEL-NEXT:    mov x22, x21
-; GISEL-NEXT:    csel x12, x25, xzr, eq
+; GISEL-NEXT:    sub x2, x13, x14
+; GISEL-NEXT:    lsr x3, x10, x2
+; GISEL-NEXT:    lsl x6, x11, x14
+; GISEL-NEXT:    and x14, x8, #0x3f
+; GISEL-NEXT:    csel x12, x0, xzr, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    str x1, [sp, #312] ; 8-byte Folded Spill
+; GISEL-NEXT:    lsr x20, x11, x2
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    str x23, [sp, #208] ; 8-byte Folded Spill
+; GISEL-NEXT:    mov x24, x0
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #3
-; GISEL-NEXT:    stp x24, x22, [sp, #40] ; 16-byte Folded Spill
+; GISEL-NEXT:    mov x7, x3
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #4
+; GISEL-NEXT:    mov x28, x1
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #5
+; GISEL-NEXT:    and x21, x8, #0x3f
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #6
+; GISEL-NEXT:    str x6, [sp, #24] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #7
+; GISEL-NEXT:    str x28, [sp, #304] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #8
+; GISEL-NEXT:    str x7, [sp, #272] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #9
+; GISEL-NEXT:    str x20, [sp, #112] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #10
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
@@ -1290,13 +1296,13 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    cmp x8, #0
 ; GISEL-NEXT:    csel x10, x10, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    str x10, [sp, #192] ; 8-byte Folded Spill
-; GISEL-NEXT:    csel x10, xzr, x26, eq
+; GISEL-NEXT:    str x10, [sp, #232] ; 8-byte Folded Spill
+; GISEL-NEXT:    csel x10, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    orr x10, x2, x10
+; GISEL-NEXT:    orr x10, x6, x10
 ; GISEL-NEXT:    csel x10, x10, xzr, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    csel x10, x25, x10, eq
+; GISEL-NEXT:    csel x10, x0, x10, eq
 ; GISEL-NEXT:    cmp x9, #2
 ; GISEL-NEXT:    csel x10, xzr, x10, eq
 ; GISEL-NEXT:    cmp x9, #3
@@ -1327,25 +1333,24 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    cmp x9, #15
 ; GISEL-NEXT:    csel x13, xzr, x13, eq
 ; GISEL-NEXT:    cmp x8, #0
-; GISEL-NEXT:    lsl x20, x12, x16
+; GISEL-NEXT:    lsl x26, x12, x14
 ; GISEL-NEXT:    csel x11, x11, x13, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    str x11, [sp, #184] ; 8-byte Folded Spill
-; GISEL-NEXT:    csel x11, xzr, x23, eq
+; GISEL-NEXT:    str x11, [sp, #224] ; 8-byte Folded Spill
+; GISEL-NEXT:    csel x11, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    orr x11, x20, x11
-; GISEL-NEXT:    lsr x15, x12, x21
-; GISEL-NEXT:    lsl x14, x10, x16
+; GISEL-NEXT:    orr x11, x26, x11
+; GISEL-NEXT:    lsr x15, x12, x2
+; GISEL-NEXT:    lsl x30, x10, x16
 ; GISEL-NEXT:    csel x11, x11, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    lsr x17, x10, x21
-; GISEL-NEXT:    csel x13, xzr, x26, eq
+; GISEL-NEXT:    lsr x17, x10, x2
+; GISEL-NEXT:    csel x13, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    str x20, [sp, #8] ; 8-byte Folded Spill
-; GISEL-NEXT:    orr x13, x2, x13
+; GISEL-NEXT:    orr x13, x6, x13
 ; GISEL-NEXT:    csel x11, x13, x11, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    csel x11, x25, x11, eq
+; GISEL-NEXT:    csel x11, x0, x11, eq
 ; GISEL-NEXT:    cmp x9, #3
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #4
@@ -1375,23 +1380,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    cmp x8, #0
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    str x11, [sp, #176] ; 8-byte Folded Spill
+; GISEL-NEXT:    str x11, [sp, #216] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x11, xzr, x15, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    orr x11, x14, x11
+; GISEL-NEXT:    orr x11, x30, x11
 ; GISEL-NEXT:    csel x11, x11, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x23, eq
+; GISEL-NEXT:    csel x12, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    orr x12, x20, x12
+; GISEL-NEXT:    orr x12, x26, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x26, eq
+; GISEL-NEXT:    csel x12, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    orr x12, x2, x12
+; GISEL-NEXT:    orr x12, x6, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    cmp x9, #3
-; GISEL-NEXT:    csel x11, x25, x11, eq
+; GISEL-NEXT:    csel x11, x0, x11, eq
 ; GISEL-NEXT:    cmp x9, #4
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #5
@@ -1421,33 +1426,33 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    lsl x0, x12, x16
 ; GISEL-NEXT:    csel x10, x10, x13, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    str x10, [sp, #168] ; 8-byte Folded Spill
+; GISEL-NEXT:    str x10, [sp, #208] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x10, xzr, x17, eq
 ; GISEL-NEXT:    cmp x9, #0
 ; GISEL-NEXT:    orr x10, x0, x10
-; GISEL-NEXT:    lsr x27, x12, x21
+; GISEL-NEXT:    lsr x4, x12, x2
 ; GISEL-NEXT:    lsl x19, x11, x16
 ; GISEL-NEXT:    csel x10, x10, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    lsr x3, x11, x21
+; GISEL-NEXT:    mov x16, x15
 ; GISEL-NEXT:    csel x13, xzr, x15, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    stp x27, x0, [sp, #240] ; 16-byte Folded Spill
-; GISEL-NEXT:    orr x13, x14, x13
-; GISEL-NEXT:    mov x7, x3
+; GISEL-NEXT:    str x4, [sp, #248] ; 8-byte Folded Spill
+; GISEL-NEXT:    orr x13, x30, x13
+; GISEL-NEXT:    str x0, [sp, #48] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x23, eq
+; GISEL-NEXT:    csel x13, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    orr x13, x20, x13
+; GISEL-NEXT:    orr x13, x26, x13
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x26, eq
+; GISEL-NEXT:    csel x13, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #3
-; GISEL-NEXT:    orr x13, x2, x13
+; GISEL-NEXT:    orr x13, x6, x13
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    cmp x9, #4
-; GISEL-NEXT:    csel x10, x25, x10, eq
+; GISEL-NEXT:    csel x10, x24, x10, eq
 ; GISEL-NEXT:    cmp x9, #5
 ; GISEL-NEXT:    csel x10, xzr, x10, eq
 ; GISEL-NEXT:    cmp x9, #6
@@ -1473,8 +1478,8 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    cmp x8, #0
 ; GISEL-NEXT:    csel x10, x12, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    str x10, [sp, #160] ; 8-byte Folded Spill
-; GISEL-NEXT:    csel x10, xzr, x27, eq
+; GISEL-NEXT:    str x10, [sp, #200] ; 8-byte Folded Spill
+; GISEL-NEXT:    csel x10, xzr, x4, eq
 ; GISEL-NEXT:    cmp x9, #0
 ; GISEL-NEXT:    orr x10, x19, x10
 ; GISEL-NEXT:    csel x10, x10, xzr, eq
@@ -1486,20 +1491,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    csel x12, xzr, x15, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    orr x12, x14, x12
+; GISEL-NEXT:    and x15, x8, #0x3f
+; GISEL-NEXT:    orr x12, x30, x12
 ; GISEL-NEXT:    csel x10, x12, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x23, eq
+; GISEL-NEXT:    csel x12, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #3
-; GISEL-NEXT:    orr x12, x20, x12
+; GISEL-NEXT:    orr x12, x26, x12
 ; GISEL-NEXT:    csel x10, x12, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x26, eq
+; GISEL-NEXT:    csel x12, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #4
-; GISEL-NEXT:    orr x12, x2, x12
+; GISEL-NEXT:    lsr x3, x11, x2
+; GISEL-NEXT:    orr x12, x6, x12
 ; GISEL-NEXT:    csel x10, x12, x10, eq
 ; GISEL-NEXT:    cmp x9, #5
-; GISEL-NEXT:    csel x10, x25, x10, eq
+; GISEL-NEXT:    csel x10, x24, x10, eq
 ; GISEL-NEXT:    cmp x9, #6
 ; GISEL-NEXT:    csel x10, xzr, x10, eq
 ; GISEL-NEXT:    cmp x9, #7
@@ -1522,21 +1529,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    cmp x9, #15
 ; GISEL-NEXT:    csel x13, xzr, x13, eq
 ; GISEL-NEXT:    cmp x8, #0
-; GISEL-NEXT:    lsl x4, x12, x16
+; GISEL-NEXT:    lsl x22, x12, x15
 ; GISEL-NEXT:    csel x11, x11, x13, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    str x11, [sp, #152] ; 8-byte Folded Spill
+; GISEL-NEXT:    str x11, [sp, #192] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x11, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    orr x11, x4, x11
-; GISEL-NEXT:    lsl x30, x10, x16
-; GISEL-NEXT:    lsr x28, x10, x21
+; GISEL-NEXT:    orr x11, x22, x11
+; GISEL-NEXT:    lsl x5, x10, x15
+; GISEL-NEXT:    lsr x27, x10, x2
 ; GISEL-NEXT:    csel x11, x11, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x27, eq
+; GISEL-NEXT:    csel x13, xzr, x4, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    str x30, [sp, #200] ; 8-byte Folded Spill
+; GISEL-NEXT:    mov x25, x27
 ; GISEL-NEXT:    orr x13, x19, x13
+; GISEL-NEXT:    mov x14, x5
+; GISEL-NEXT:    str x27, [sp, #328] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x11, x13, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    csel x13, xzr, x17, eq
@@ -1544,30 +1553,29 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    orr x13, x0, x13
 ; GISEL-NEXT:    csel x11, x13, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x15, eq
+; GISEL-NEXT:    csel x13, xzr, x16, eq
 ; GISEL-NEXT:    cmp x9, #3
-; GISEL-NEXT:    orr x13, x14, x13
+; GISEL-NEXT:    orr x13, x30, x13
 ; GISEL-NEXT:    csel x11, x13, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x23, eq
+; GISEL-NEXT:    csel x13, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #4
-; GISEL-NEXT:    orr x13, x20, x13
+; GISEL-NEXT:    orr x13, x26, x13
 ; GISEL-NEXT:    csel x11, x13, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x26, eq
+; GISEL-NEXT:    csel x13, xzr, x7, eq
 ; GISEL-NEXT:    cmp x9, #5
-; GISEL-NEXT:    orr x13, x2, x13
+; GISEL-NEXT:    orr x13, x6, x13
 ; GISEL-NEXT:    csel x11, x13, x11, eq
 ; GISEL-NEXT:    cmp x9, #6
-; GISEL-NEXT:    lsr x13, x12, x21
-; GISEL-NEXT:    csel x11, x25, x11, eq
+; GISEL-NEXT:    lsr x13, x12, x2
+; GISEL-NEXT:    csel x11, x24, x11, eq
 ; GISEL-NEXT:    cmp x9, #7
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #8
-; GISEL-NEXT:    mov x6, x13
+; GISEL-NEXT:    mov x15, x13
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #9
-; GISEL-NEXT:    str x6, [sp, #256] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #10
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
@@ -1584,18 +1592,18 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    cmp x8, #0
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    str x11, [sp, #144] ; 8-byte Folded Spill
+; GISEL-NEXT:    str x11, [sp, #184] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x11, xzr, x13, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    orr x11, x30, x11
+; GISEL-NEXT:    orr x11, x5, x11
 ; GISEL-NEXT:    csel x11, x11, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    csel x12, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    orr x12, x4, x12
+; GISEL-NEXT:    orr x12, x22, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x27, eq
+; GISEL-NEXT:    csel x12, xzr, x4, eq
 ; GISEL-NEXT:    cmp x9, #2
 ; GISEL-NEXT:    orr x12, x19, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
@@ -1605,22 +1613,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    orr x12, x0, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x15, eq
+; GISEL-NEXT:    csel x12, xzr, x16, eq
 ; GISEL-NEXT:    cmp x9, #4
-; GISEL-NEXT:    orr x12, x14, x12
+; GISEL-NEXT:    orr x12, x30, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x23, eq
+; GISEL-NEXT:    csel x12, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #5
-; GISEL-NEXT:    orr x12, x20, x12
+; GISEL-NEXT:    orr x12, x26, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x26, eq
+; GISEL-NEXT:    csel x12, xzr, x7, eq
 ; GISEL-NEXT:    cmp x9, #6
-; GISEL-NEXT:    orr x12, x2, x12
+; GISEL-NEXT:    orr x12, x6, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    cmp x9, #7
-; GISEL-NEXT:    csel x11, x25, x11, eq
+; GISEL-NEXT:    csel x11, x24, x11, eq
 ; GISEL-NEXT:    cmp x9, #8
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #9
@@ -1635,39 +1643,34 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #14
 ; GISEL-NEXT:    csel x12, xzr, x11, eq
-; GISEL-NEXT:    ldp x11, x5, [x1, #64]
+; GISEL-NEXT:    ldp x11, x1, [x1, #64]
 ; GISEL-NEXT:    cmp x9, #15
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x8, #0
 ; GISEL-NEXT:    csel x12, x10, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    lsl x21, x11, x16
-; GISEL-NEXT:    str x12, [sp, #136] ; 8-byte Folded Spill
-; GISEL-NEXT:    csel x12, xzr, x28, eq
+; GISEL-NEXT:    lsl x23, x11, x21
+; GISEL-NEXT:    str x12, [sp, #176] ; 8-byte Folded Spill
+; GISEL-NEXT:    csel x12, xzr, x27, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    orr x12, x21, x12
-; GISEL-NEXT:    lsr x10, x11, x22
-; GISEL-NEXT:    mov x16, x19
+; GISEL-NEXT:    orr x12, x23, x12
+; GISEL-NEXT:    lsr x21, x11, x2
+; GISEL-NEXT:    str x23, [sp, #288] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x12, x12, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    mov x1, x16
 ; GISEL-NEXT:    csel x13, xzr, x13, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    str x16, [sp, #304] ; 8-byte Folded Spill
-; GISEL-NEXT:    orr x13, x30, x13
+; GISEL-NEXT:    orr x13, x5, x13
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    csel x13, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    lsl x3, x5, x24
-; GISEL-NEXT:    orr x13, x4, x13
+; GISEL-NEXT:    orr x13, x22, x13
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    stp x21, x3, [sp, #216] ; 16-byte Folded Spill
-; GISEL-NEXT:    csel x13, xzr, x27, eq
+; GISEL-NEXT:    csel x13, xzr, x4, eq
 ; GISEL-NEXT:    cmp x9, #3
 ; GISEL-NEXT:    orr x13, x19, x13
-; GISEL-NEXT:    mov x19, x28
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    csel x13, xzr, x17, eq
@@ -1675,27 +1678,30 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    orr x13, x0, x13
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x15, eq
+; GISEL-NEXT:    csel x13, xzr, x16, eq
 ; GISEL-NEXT:    cmp x9, #5
-; GISEL-NEXT:    orr x13, x14, x13
+; GISEL-NEXT:    orr x13, x30, x13
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x23, eq
+; GISEL-NEXT:    csel x13, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #6
-; GISEL-NEXT:    orr x13, x20, x13
+; GISEL-NEXT:    orr x13, x26, x13
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x26, eq
+; GISEL-NEXT:    csel x13, xzr, x7, eq
 ; GISEL-NEXT:    cmp x9, #7
-; GISEL-NEXT:    orr x13, x2, x13
+; GISEL-NEXT:    orr x13, x6, x13
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    cmp x9, #8
-; GISEL-NEXT:    csel x12, x25, x12, eq
+; GISEL-NEXT:    and x13, x8, #0x3f
+; GISEL-NEXT:    csel x12, x24, x12, eq
 ; GISEL-NEXT:    cmp x9, #9
+; GISEL-NEXT:    lsl x10, x1, x13
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #10
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #11
+; GISEL-NEXT:    stp x10, x15, [sp, #312] ; 16-byte Folded Spill
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #12
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
@@ -1708,69 +1714,69 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    cmp x8, #0
 ; GISEL-NEXT:    csel x11, x11, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    str x11, [sp, #128] ; 8-byte Folded Spill
-; GISEL-NEXT:    csel x11, xzr, x10, eq
+; GISEL-NEXT:    str x11, [sp, #168] ; 8-byte Folded Spill
+; GISEL-NEXT:    csel x11, xzr, x21, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    orr x11, x3, x11
+; GISEL-NEXT:    orr x11, x10, x11
+; GISEL-NEXT:    mov x10, x23
 ; GISEL-NEXT:    csel x11, x11, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x28, eq
+; GISEL-NEXT:    csel x12, xzr, x27, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    mov x28, x4
-; GISEL-NEXT:    orr x12, x21, x12
-; GISEL-NEXT:    str x28, [sp, #32] ; 8-byte Folded Spill
+; GISEL-NEXT:    mov x27, x24
+; GISEL-NEXT:    orr x12, x23, x12
+; GISEL-NEXT:    mov x23, x15
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x6, eq
+; GISEL-NEXT:    csel x12, xzr, x15, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    orr x12, x30, x12
+; GISEL-NEXT:    mov x15, x22
+; GISEL-NEXT:    orr x12, x5, x12
+; GISEL-NEXT:    mov x5, x3
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x7, eq
+; GISEL-NEXT:    stp x14, x5, [sp, #256] ; 16-byte Folded Spill
+; GISEL-NEXT:    csel x12, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #3
-; GISEL-NEXT:    orr x12, x4, x12
-; GISEL-NEXT:    mov x4, x20
+; GISEL-NEXT:    mov x5, x4
+; GISEL-NEXT:    orr x12, x22, x12
+; GISEL-NEXT:    lsr x22, x1, x2
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x27, eq
+; GISEL-NEXT:    csel x12, xzr, x4, eq
 ; GISEL-NEXT:    cmp x9, #4
-; GISEL-NEXT:    mov x27, x2
-; GISEL-NEXT:    orr x12, x16, x12
-; GISEL-NEXT:    mov x16, x17
+; GISEL-NEXT:    str x22, [sp, #240] ; 8-byte Folded Spill
+; GISEL-NEXT:    orr x12, x19, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    csel x12, xzr, x17, eq
 ; GISEL-NEXT:    cmp x9, #5
-; GISEL-NEXT:    mov x17, x15
 ; GISEL-NEXT:    orr x12, x0, x12
-; GISEL-NEXT:    lsr x0, x5, x22
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x15, eq
+; GISEL-NEXT:    csel x12, xzr, x16, eq
 ; GISEL-NEXT:    cmp x9, #6
-; GISEL-NEXT:    ldr x15, [sp, #312] ; 8-byte Folded Reload
-; GISEL-NEXT:    orr x12, x14, x12
-; GISEL-NEXT:    str x0, [sp, #280] ; 8-byte Folded Spill
+; GISEL-NEXT:    orr x12, x30, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x23, eq
+; GISEL-NEXT:    csel x12, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #7
-; GISEL-NEXT:    mov x23, x25
-; GISEL-NEXT:    orr x12, x20, x12
-; GISEL-NEXT:    str x23, [sp, #288] ; 8-byte Folded Spill
+; GISEL-NEXT:    orr x12, x26, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x26, eq
+; GISEL-NEXT:    csel x12, xzr, x7, eq
 ; GISEL-NEXT:    cmp x9, #8
-; GISEL-NEXT:    orr x12, x2, x12
-; GISEL-NEXT:    mov x2, x3
+; GISEL-NEXT:    mov x7, x14
+; GISEL-NEXT:    orr x12, x6, x12
+; GISEL-NEXT:    mov x6, x28
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    cmp x9, #9
-; GISEL-NEXT:    csel x11, x25, x11, eq
+; GISEL-NEXT:    csel x11, x24, x11, eq
 ; GISEL-NEXT:    cmp x9, #10
-; GISEL-NEXT:    mov x25, x26
+; GISEL-NEXT:    ldr x24, [x6, #88]
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #11
+; GISEL-NEXT:    ldr x6, [sp, #272] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #12
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
@@ -1780,80 +1786,84 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #15
 ; GISEL-NEXT:    csel x12, xzr, x11, eq
+; GISEL-NEXT:    ldr x11, [x28, #80]
 ; GISEL-NEXT:    cmp x8, #0
-; GISEL-NEXT:    csel x12, x5, x12, eq
-; GISEL-NEXT:    ldp x11, x5, [x15, #80]
+; GISEL-NEXT:    csel x12, x1, x12, eq
+; GISEL-NEXT:    mov x28, x2
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    str x12, [sp, #120] ; 8-byte Folded Spill
-; GISEL-NEXT:    mov x15, x7
-; GISEL-NEXT:    csel x12, xzr, x0, eq
+; GISEL-NEXT:    lsl x2, x11, x13
+; GISEL-NEXT:    str x12, [sp, #160] ; 8-byte Folded Spill
+; GISEL-NEXT:    csel x12, xzr, x22, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    str x15, [sp, #24] ; 8-byte Folded Spill
-; GISEL-NEXT:    lsl x20, x11, x24
-; GISEL-NEXT:    orr x12, x20, x12
-; GISEL-NEXT:    str x20, [sp, #232] ; 8-byte Folded Spill
+; GISEL-NEXT:    ldr x1, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT:    str x28, [sp, #16] ; 8-byte Folded Spill
+; GISEL-NEXT:    orr x12, x2, x12
+; GISEL-NEXT:    str x2, [sp, #280] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x12, x12, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x10, eq
+; GISEL-NEXT:    csel x13, xzr, x21, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    orr x13, x3, x13
-; GISEL-NEXT:    lsl x3, x5, x24
+; GISEL-NEXT:    orr x13, x1, x13
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x19, eq
+; GISEL-NEXT:    csel x13, xzr, x25, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    stp x19, x3, [sp, #264] ; 16-byte Folded Spill
-; GISEL-NEXT:    orr x13, x21, x13
+; GISEL-NEXT:    mov x25, x16
+; GISEL-NEXT:    orr x13, x10, x13
+; GISEL-NEXT:    mov x10, x30
+; GISEL-NEXT:    str x25, [sp, #80] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x6, eq
+; GISEL-NEXT:    csel x13, xzr, x23, eq
 ; GISEL-NEXT:    cmp x9, #3
-; GISEL-NEXT:    orr x13, x30, x13
+; GISEL-NEXT:    mov x23, x3
+; GISEL-NEXT:    orr x13, x14, x13
+; GISEL-NEXT:    mov x14, x17
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x7, eq
-; GISEL-NEXT:    ldp x7, x30, [sp, #240] ; 16-byte Folded Reload
+; GISEL-NEXT:    stp x19, x14, [sp, #64] ; 16-byte Folded Spill
+; GISEL-NEXT:    csel x13, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #4
-; GISEL-NEXT:    orr x13, x28, x13
+; GISEL-NEXT:    mov x3, x21
+; GISEL-NEXT:    orr x13, x15, x13
+; GISEL-NEXT:    str x3, [sp, #32] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x7, eq
+; GISEL-NEXT:    csel x13, xzr, x4, eq
 ; GISEL-NEXT:    cmp x9, #5
-; GISEL-NEXT:    orr x13, x1, x13
-; GISEL-NEXT:    mov x1, x14
+; GISEL-NEXT:    mov x4, x0
+; GISEL-NEXT:    orr x13, x19, x13
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x16, eq
+; GISEL-NEXT:    csel x13, xzr, x17, eq
 ; GISEL-NEXT:    cmp x9, #6
-; GISEL-NEXT:    orr x13, x30, x13
+; GISEL-NEXT:    mov x17, x27
+; GISEL-NEXT:    orr x13, x0, x13
+; GISEL-NEXT:    ldr x0, [sp, #24] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x17, eq
+; GISEL-NEXT:    csel x13, xzr, x16, eq
 ; GISEL-NEXT:    cmp x9, #7
-; GISEL-NEXT:    orr x13, x14, x13
-; GISEL-NEXT:    ldr x14, [sp, #208] ; 8-byte Folded Reload
+; GISEL-NEXT:    orr x13, x30, x13
+; GISEL-NEXT:    ldp x30, x16, [sp, #320] ; 16-byte Folded Reload
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x14, eq
+; GISEL-NEXT:    csel x13, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #8
-; GISEL-NEXT:    orr x13, x4, x13
-; GISEL-NEXT:    mov x4, x10
+; GISEL-NEXT:    orr x13, x26, x13
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x26, eq
+; GISEL-NEXT:    csel x13, xzr, x6, eq
 ; GISEL-NEXT:    cmp x9, #9
-; GISEL-NEXT:    mov x26, x27
-; GISEL-NEXT:    orr x13, x27, x13
-; GISEL-NEXT:    lsr x27, x11, x22
+; GISEL-NEXT:    orr x13, x0, x13
 ; GISEL-NEXT:    csel x12, x13, x12, eq
 ; GISEL-NEXT:    cmp x9, #10
-; GISEL-NEXT:    mov x13, x23
-; GISEL-NEXT:    csel x12, x23, x12, eq
+; GISEL-NEXT:    lsr x13, x11, x28
+; GISEL-NEXT:    csel x12, x27, x12, eq
 ; GISEL-NEXT:    cmp x9, #11
-; GISEL-NEXT:    str x27, [sp, #64] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #12
-; GISEL-NEXT:    mov x23, x20
+; GISEL-NEXT:    str x13, [sp, #96] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
 ; GISEL-NEXT:    cmp x9, #13
 ; GISEL-NEXT:    csel x12, xzr, x12, eq
@@ -1864,71 +1874,77 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    cmp x8, #0
 ; GISEL-NEXT:    csel x11, x11, x12, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    str x11, [sp, #104] ; 8-byte Folded Spill
-; GISEL-NEXT:    csel x11, xzr, x27, eq
+; GISEL-NEXT:    str x11, [sp, #152] ; 8-byte Folded Spill
+; GISEL-NEXT:    and x11, x8, #0x3f
+; GISEL-NEXT:    lsl x27, x24, x11
+; GISEL-NEXT:    csel x11, xzr, x13, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    orr x11, x3, x11
+; GISEL-NEXT:    orr x11, x27, x11
+; GISEL-NEXT:    str x27, [sp, #56] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x11, x11, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x0, eq
+; GISEL-NEXT:    csel x12, xzr, x22, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    mov x0, x7
-; GISEL-NEXT:    orr x12, x20, x12
-; GISEL-NEXT:    mov x20, x16
+; GISEL-NEXT:    mov x22, x2
+; GISEL-NEXT:    orr x12, x2, x12
+; GISEL-NEXT:    mov x2, x14
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x10, eq
+; GISEL-NEXT:    csel x12, xzr, x21, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    ldr x10, [sp, #312] ; 8-byte Folded Reload
-; GISEL-NEXT:    orr x12, x2, x12
-; GISEL-NEXT:    ldr x2, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldr x21, [sp, #288] ; 8-byte Folded Reload
+; GISEL-NEXT:    orr x12, x1, x12
+; GISEL-NEXT:    mov x1, x27
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x19, eq
+; GISEL-NEXT:    csel x12, xzr, x16, eq
 ; GISEL-NEXT:    cmp x9, #3
 ; GISEL-NEXT:    orr x12, x21, x12
-; GISEL-NEXT:    ldr x21, [sp, #200] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x6, eq
+; GISEL-NEXT:    csel x12, xzr, x30, eq
 ; GISEL-NEXT:    cmp x9, #4
-; GISEL-NEXT:    orr x12, x21, x12
+; GISEL-NEXT:    orr x12, x7, x12
+; GISEL-NEXT:    mov x7, x15
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x15, eq
+; GISEL-NEXT:    str x7, [sp, #40] ; 8-byte Folded Spill
+; GISEL-NEXT:    csel x12, xzr, x23, eq
 ; GISEL-NEXT:    cmp x9, #5
-; GISEL-NEXT:    orr x12, x28, x12
+; GISEL-NEXT:    orr x12, x15, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x7, eq
+; GISEL-NEXT:    csel x12, xzr, x5, eq
 ; GISEL-NEXT:    cmp x9, #6
-; GISEL-NEXT:    mov x7, x17
-; GISEL-NEXT:    orr x12, x2, x12
+; GISEL-NEXT:    mov x5, x19
+; GISEL-NEXT:    orr x12, x19, x12
+; GISEL-NEXT:    mov x19, x7
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x16, eq
+; GISEL-NEXT:    csel x12, xzr, x14, eq
 ; GISEL-NEXT:    cmp x9, #7
-; GISEL-NEXT:    orr x12, x30, x12
+; GISEL-NEXT:    lsr x14, x24, x28
+; GISEL-NEXT:    orr x12, x4, x12
+; GISEL-NEXT:    mov x4, x10
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x17, eq
+; GISEL-NEXT:    csel x12, xzr, x25, eq
 ; GISEL-NEXT:    cmp x9, #8
-; GISEL-NEXT:    mov x17, x24
-; GISEL-NEXT:    orr x12, x1, x12
+; GISEL-NEXT:    orr x12, x10, x12
+; GISEL-NEXT:    ldr x10, [sp, #304] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x14, eq
-; GISEL-NEXT:    ldr x14, [sp, #8] ; 8-byte Folded Reload
+; GISEL-NEXT:    csel x12, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #9
-; GISEL-NEXT:    orr x12, x14, x12
+; GISEL-NEXT:    orr x12, x26, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x12, xzr, x25, eq
+; GISEL-NEXT:    csel x12, xzr, x6, eq
 ; GISEL-NEXT:    cmp x9, #10
-; GISEL-NEXT:    orr x12, x26, x12
+; GISEL-NEXT:    orr x12, x0, x12
 ; GISEL-NEXT:    csel x11, x12, x11, eq
 ; GISEL-NEXT:    cmp x9, #11
-; GISEL-NEXT:    csel x11, x13, x11, eq
+; GISEL-NEXT:    csel x11, x17, x11, eq
 ; GISEL-NEXT:    cmp x9, #12
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #13
@@ -1937,393 +1953,395 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #15
 ; GISEL-NEXT:    csel x12, xzr, x11, eq
+; GISEL-NEXT:    ldp x11, x6, [x10, #96]
 ; GISEL-NEXT:    cmp x8, #0
-; GISEL-NEXT:    ldp x11, x10, [x10, #96]
-; GISEL-NEXT:    csel x12, x5, x12, eq
-; GISEL-NEXT:    str x12, [sp, #96] ; 8-byte Folded Spill
-; GISEL-NEXT:    mov x12, x22
-; GISEL-NEXT:    lsr x22, x5, x22
-; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    mov x5, x27
-; GISEL-NEXT:    lsl x24, x11, x24
-; GISEL-NEXT:    str x10, [sp, #296] ; 8-byte Folded Spill
-; GISEL-NEXT:    csel x10, xzr, x22, eq
+; GISEL-NEXT:    and x10, x8, #0x3f
+; GISEL-NEXT:    csel x12, x24, x12, eq
+; GISEL-NEXT:    tst x8, #0x3f
+; GISEL-NEXT:    ldr x24, [sp, #248] ; 8-byte Folded Reload
+; GISEL-NEXT:    lsl x15, x11, x10
+; GISEL-NEXT:    csel x10, xzr, x14, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    str x22, [sp, #16] ; 8-byte Folded Spill
-; GISEL-NEXT:    orr x10, x24, x10
+; GISEL-NEXT:    str x12, [sp, #136] ; 8-byte Folded Spill
+; GISEL-NEXT:    ldr x12, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT:    orr x10, x15, x10
+; GISEL-NEXT:    str x15, [sp, #296] ; 8-byte Folded Spill
+; GISEL-NEXT:    mov x15, x13
 ; GISEL-NEXT:    csel x10, x10, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x27, eq
+; GISEL-NEXT:    csel x13, xzr, x13, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    ldr x27, [sp, #280] ; 8-byte Folded Reload
-; GISEL-NEXT:    orr x13, x3, x13
-; GISEL-NEXT:    mov x3, x26
+; GISEL-NEXT:    orr x13, x27, x13
+; GISEL-NEXT:    ldr x27, [sp, #240] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    csel x13, xzr, x27, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    orr x13, x23, x13
-; GISEL-NEXT:    mov x23, x4
+; GISEL-NEXT:    orr x13, x22, x13
+; GISEL-NEXT:    ldr x22, [sp, #272] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x4, eq
-; GISEL-NEXT:    ldp x4, x16, [sp, #216] ; 16-byte Folded Reload
+; GISEL-NEXT:    csel x13, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #3
-; GISEL-NEXT:    orr x13, x16, x13
+; GISEL-NEXT:    orr x13, x12, x13
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x19, eq
+; GISEL-NEXT:    csel x13, xzr, x16, eq
 ; GISEL-NEXT:    cmp x9, #4
-; GISEL-NEXT:    mov x19, x1
-; GISEL-NEXT:    orr x13, x4, x13
+; GISEL-NEXT:    mov x16, x17
+; GISEL-NEXT:    orr x13, x21, x13
+; GISEL-NEXT:    ldp x23, x21, [sp, #256] ; 16-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x6, eq
+; GISEL-NEXT:    csel x13, xzr, x30, eq
 ; GISEL-NEXT:    cmp x9, #5
-; GISEL-NEXT:    mov x6, x14
-; GISEL-NEXT:    orr x13, x21, x13
+; GISEL-NEXT:    mov x30, x0
+; GISEL-NEXT:    orr x13, x23, x13
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x15, eq
+; GISEL-NEXT:    csel x13, xzr, x21, eq
 ; GISEL-NEXT:    cmp x9, #6
-; GISEL-NEXT:    orr x13, x28, x13
+; GISEL-NEXT:    orr x13, x7, x13
+; GISEL-NEXT:    mov x7, x14
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x0, eq
+; GISEL-NEXT:    csel x13, xzr, x24, eq
 ; GISEL-NEXT:    cmp x9, #7
-; GISEL-NEXT:    mov x0, x23
-; GISEL-NEXT:    orr x13, x2, x13
+; GISEL-NEXT:    orr x13, x5, x13
+; GISEL-NEXT:    ldr x5, [sp, #48] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x20, eq
+; GISEL-NEXT:    csel x13, xzr, x2, eq
 ; GISEL-NEXT:    cmp x9, #8
-; GISEL-NEXT:    orr x13, x30, x13
-; GISEL-NEXT:    ldr x30, [sp, #208] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldr x2, [sp, #296] ; 8-byte Folded Reload
+; GISEL-NEXT:    orr x13, x5, x13
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x7, eq
+; GISEL-NEXT:    csel x13, xzr, x25, eq
 ; GISEL-NEXT:    cmp x9, #9
-; GISEL-NEXT:    orr x13, x1, x13
+; GISEL-NEXT:    mov x25, x6
+; GISEL-NEXT:    orr x13, x4, x13
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x30, eq
+; GISEL-NEXT:    csel x13, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #10
-; GISEL-NEXT:    orr x13, x14, x13
-; GISEL-NEXT:    ldp x14, x2, [sp, #264] ; 16-byte Folded Reload
+; GISEL-NEXT:    orr x13, x26, x13
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x13, xzr, x25, eq
+; GISEL-NEXT:    csel x13, xzr, x22, eq
 ; GISEL-NEXT:    cmp x9, #11
-; GISEL-NEXT:    orr x13, x26, x13
-; GISEL-NEXT:    ldr x26, [sp, #288] ; 8-byte Folded Reload
+; GISEL-NEXT:    orr x13, x0, x13
 ; GISEL-NEXT:    csel x10, x13, x10, eq
 ; GISEL-NEXT:    cmp x9, #12
-; GISEL-NEXT:    lsr x13, x11, x12
-; GISEL-NEXT:    csel x10, x26, x10, eq
+; GISEL-NEXT:    lsr x13, x11, x28
+; GISEL-NEXT:    csel x10, x17, x10, eq
 ; GISEL-NEXT:    cmp x9, #13
+; GISEL-NEXT:    ldr x17, [sp, #80] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x10, xzr, x10, eq
 ; GISEL-NEXT:    cmp x9, #14
-; GISEL-NEXT:    str x13, [sp, #72] ; 8-byte Folded Spill
+; GISEL-NEXT:    str x13, [sp, #104] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x10, xzr, x10, eq
 ; GISEL-NEXT:    cmp x9, #15
 ; GISEL-NEXT:    csel x10, xzr, x10, eq
 ; GISEL-NEXT:    cmp x8, #0
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    str x10, [sp, #88] ; 8-byte Folded Spill
-; GISEL-NEXT:    ldr x10, [sp, #296] ; 8-byte Folded Reload
-; GISEL-NEXT:    lsl x11, x10, x17
+; GISEL-NEXT:    str x10, [sp, #128] ; 8-byte Folded Spill
+; GISEL-NEXT:    and x10, x8, #0x3f
+; GISEL-NEXT:    lsl x11, x6, x10
 ; GISEL-NEXT:    csel x10, xzr, x13, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    ldr x17, [sp, #232] ; 8-byte Folded Reload
-; GISEL-NEXT:    ldr x13, [sp, #256] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldp x0, x13, [sp, #280] ; 16-byte Folded Reload
+; GISEL-NEXT:    mov x6, x16
 ; GISEL-NEXT:    orr x10, x11, x10
-; GISEL-NEXT:    str x11, [sp, #56] ; 8-byte Folded Spill
+; GISEL-NEXT:    str x11, [sp, #88] ; 8-byte Folded Spill
 ; GISEL-NEXT:    csel x10, x10, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x22, eq
+; GISEL-NEXT:    csel x11, xzr, x14, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    orr x11, x24, x11
+; GISEL-NEXT:    orr x11, x2, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x5, eq
+; GISEL-NEXT:    csel x11, xzr, x15, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    orr x11, x2, x11
-; GISEL-NEXT:    ldp x12, x5, [sp, #240] ; 16-byte Folded Reload
+; GISEL-NEXT:    mov x15, x3
+; GISEL-NEXT:    orr x11, x1, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    csel x11, xzr, x27, eq
 ; GISEL-NEXT:    cmp x9, #3
-; GISEL-NEXT:    mov x27, x30
-; GISEL-NEXT:    orr x11, x17, x11
+; GISEL-NEXT:    orr x11, x0, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x23, eq
+; GISEL-NEXT:    csel x11, xzr, x3, eq
+; GISEL-NEXT:    ldp x14, x3, [sp, #320] ; 16-byte Folded Reload
 ; GISEL-NEXT:    cmp x9, #4
-; GISEL-NEXT:    mov x23, x20
-; GISEL-NEXT:    orr x11, x16, x11
-; GISEL-NEXT:    ldr x16, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT:    orr x11, x12, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x14, eq
+; GISEL-NEXT:    csel x11, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #5
-; GISEL-NEXT:    orr x11, x4, x11
+; GISEL-NEXT:    orr x11, x13, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x13, eq
+; GISEL-NEXT:    csel x11, xzr, x14, eq
 ; GISEL-NEXT:    cmp x9, #6
-; GISEL-NEXT:    orr x11, x21, x11
-; GISEL-NEXT:    ldr x21, [sp, #296] ; 8-byte Folded Reload
+; GISEL-NEXT:    orr x11, x23, x11
+; GISEL-NEXT:    mov x23, x5
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x15, eq
+; GISEL-NEXT:    csel x11, xzr, x21, eq
 ; GISEL-NEXT:    cmp x9, #7
-; GISEL-NEXT:    orr x11, x28, x11
+; GISEL-NEXT:    mov x21, x4
+; GISEL-NEXT:    orr x11, x19, x11
+; GISEL-NEXT:    ldp x12, x19, [sp, #64] ; 16-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x12, eq
+; GISEL-NEXT:    csel x11, xzr, x24, eq
 ; GISEL-NEXT:    cmp x9, #8
-; GISEL-NEXT:    orr x11, x16, x11
+; GISEL-NEXT:    orr x11, x12, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x20, eq
+; GISEL-NEXT:    csel x11, xzr, x19, eq
 ; GISEL-NEXT:    cmp x9, #9
 ; GISEL-NEXT:    orr x11, x5, x11
+; GISEL-NEXT:    mov x5, x30
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x7, eq
+; GISEL-NEXT:    csel x11, xzr, x17, eq
 ; GISEL-NEXT:    cmp x9, #10
-; GISEL-NEXT:    orr x11, x1, x11
-; GISEL-NEXT:    ldr x1, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT:    orr x11, x4, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x30, eq
+; GISEL-NEXT:    csel x11, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #11
-; GISEL-NEXT:    orr x11, x6, x11
+; GISEL-NEXT:    orr x11, x26, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x25, eq
+; GISEL-NEXT:    csel x11, xzr, x22, eq
 ; GISEL-NEXT:    cmp x9, #12
-; GISEL-NEXT:    orr x11, x3, x11
+; GISEL-NEXT:    orr x11, x30, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    cmp x9, #13
-; GISEL-NEXT:    csel x10, x26, x10, eq
+; GISEL-NEXT:    csel x10, x16, x10, eq
 ; GISEL-NEXT:    cmp x9, #14
+; GISEL-NEXT:    ldr x16, [sp, #304] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x10, xzr, x10, eq
 ; GISEL-NEXT:    cmp x9, #15
 ; GISEL-NEXT:    csel x11, xzr, x10, eq
 ; GISEL-NEXT:    cmp x8, #0
-; GISEL-NEXT:    csel x11, x21, x11, eq
-; GISEL-NEXT:    ldp x10, x20, [x1, #112]
-; GISEL-NEXT:    str x11, [sp, #80] ; 8-byte Folded Spill
-; GISEL-NEXT:    ldp x11, x4, [sp, #40] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x10, x4, [x16, #112]
+; GISEL-NEXT:    csel x11, x25, x11, eq
+; GISEL-NEXT:    str x11, [sp, #120] ; 8-byte Folded Spill
+; GISEL-NEXT:    lsr x11, x25, x28
+; GISEL-NEXT:    and x16, x8, #0x3f
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    lsr x21, x21, x4
-; GISEL-NEXT:    lsl x28, x10, x11
-; GISEL-NEXT:    csel x1, xzr, x21, eq
-; GISEL-NEXT:    str x21, [sp, #296] ; 8-byte Folded Spill
+; GISEL-NEXT:    ldr x25, [sp, #88] ; 8-byte Folded Reload
+; GISEL-NEXT:    lsl x24, x10, x16
+; GISEL-NEXT:    csel x1, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    orr x1, x28, x1
-; GISEL-NEXT:    ldr x21, [sp, #72] ; 8-byte Folded Reload
-; GISEL-NEXT:    str x28, [sp, #312] ; 8-byte Folded Spill
+; GISEL-NEXT:    ldp x16, x28, [sp, #96] ; 16-byte Folded Reload
+; GISEL-NEXT:    orr x1, x24, x1
 ; GISEL-NEXT:    csel x1, x1, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    ldr x28, [sp, #56] ; 8-byte Folded Reload
-; GISEL-NEXT:    csel x30, xzr, x21, eq
+; GISEL-NEXT:    csel x30, xzr, x28, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    orr x30, x28, x30
+; GISEL-NEXT:    orr x30, x25, x30
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x22, eq
+; GISEL-NEXT:    csel x30, xzr, x7, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    ldr x22, [sp, #64] ; 8-byte Folded Reload
-; GISEL-NEXT:    orr x30, x24, x30
+; GISEL-NEXT:    orr x30, x2, x30
+; GISEL-NEXT:    ldr x2, [sp, #56] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x22, eq
+; GISEL-NEXT:    csel x30, xzr, x16, eq
 ; GISEL-NEXT:    cmp x9, #3
 ; GISEL-NEXT:    orr x30, x2, x30
-; GISEL-NEXT:    ldr x2, [sp, #280] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x2, eq
+; GISEL-NEXT:    csel x30, xzr, x27, eq
 ; GISEL-NEXT:    cmp x9, #4
-; GISEL-NEXT:    orr x30, x17, x30
-; GISEL-NEXT:    ldr x17, [sp, #224] ; 8-byte Folded Reload
+; GISEL-NEXT:    mov x27, x13
+; GISEL-NEXT:    orr x30, x0, x30
+; GISEL-NEXT:    ldr x0, [sp, #248] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x0, eq
+; GISEL-NEXT:    csel x30, xzr, x15, eq
+; GISEL-NEXT:    ldr x15, [sp, #312] ; 8-byte Folded Reload
 ; GISEL-NEXT:    cmp x9, #5
-; GISEL-NEXT:    orr x30, x17, x30
+; GISEL-NEXT:    orr x30, x15, x30
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x14, eq
-; GISEL-NEXT:    ldr x14, [sp, #216] ; 8-byte Folded Reload
+; GISEL-NEXT:    csel x30, xzr, x3, eq
 ; GISEL-NEXT:    cmp x9, #6
-; GISEL-NEXT:    orr x30, x14, x30
+; GISEL-NEXT:    ldr x3, [sp, #40] ; 8-byte Folded Reload
+; GISEL-NEXT:    orr x30, x13, x30
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x13, eq
-; GISEL-NEXT:    ldr x13, [sp, #200] ; 8-byte Folded Reload
+; GISEL-NEXT:    csel x30, xzr, x14, eq
+; GISEL-NEXT:    ldp x13, x14, [sp, #256] ; 16-byte Folded Reload
 ; GISEL-NEXT:    cmp x9, #7
 ; GISEL-NEXT:    orr x30, x13, x30
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x15, eq
-; GISEL-NEXT:    ldr x15, [sp, #32] ; 8-byte Folded Reload
+; GISEL-NEXT:    csel x30, xzr, x14, eq
 ; GISEL-NEXT:    cmp x9, #8
-; GISEL-NEXT:    orr x30, x15, x30
+; GISEL-NEXT:    orr x30, x3, x30
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x12, eq
+; GISEL-NEXT:    csel x30, xzr, x0, eq
 ; GISEL-NEXT:    cmp x9, #9
-; GISEL-NEXT:    orr x30, x16, x30
+; GISEL-NEXT:    orr x30, x12, x30
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x23, eq
+; GISEL-NEXT:    csel x30, xzr, x19, eq
 ; GISEL-NEXT:    cmp x9, #10
-; GISEL-NEXT:    orr x30, x5, x30
+; GISEL-NEXT:    orr x30, x23, x30
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x7, eq
+; GISEL-NEXT:    csel x30, xzr, x17, eq
 ; GISEL-NEXT:    cmp x9, #11
-; GISEL-NEXT:    orr x30, x19, x30
+; GISEL-NEXT:    orr x30, x21, x30
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x27, eq
+; GISEL-NEXT:    csel x30, xzr, x20, eq
 ; GISEL-NEXT:    cmp x9, #12
-; GISEL-NEXT:    orr x30, x6, x30
+; GISEL-NEXT:    mov x20, x26
+; GISEL-NEXT:    orr x30, x26, x30
+; GISEL-NEXT:    mov x26, x5
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x30, xzr, x25, eq
+; GISEL-NEXT:    csel x30, xzr, x22, eq
 ; GISEL-NEXT:    cmp x9, #13
-; GISEL-NEXT:    orr x30, x3, x30
+; GISEL-NEXT:    orr x30, x5, x30
+; GISEL-NEXT:    ldr x5, [sp, #16] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x1, x30, x1, eq
 ; GISEL-NEXT:    cmp x9, #14
-; GISEL-NEXT:    lsr x30, x10, x4
-; GISEL-NEXT:    csel x1, x26, x1, eq
+; GISEL-NEXT:    csel x1, x6, x1, eq
 ; GISEL-NEXT:    cmp x9, #15
+; GISEL-NEXT:    lsr x30, x10, x5
 ; GISEL-NEXT:    csel x1, xzr, x1, eq
 ; GISEL-NEXT:    cmp x8, #0
-; GISEL-NEXT:    csel x26, x10, x1, eq
-; GISEL-NEXT:    lsl x10, x20, x11
+; GISEL-NEXT:    csel x5, x10, x1, eq
+; GISEL-NEXT:    and x10, x8, #0x3f
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x16, xzr, x30, eq
+; GISEL-NEXT:    lsl x10, x4, x10
+; GISEL-NEXT:    csel x1, xzr, x30, eq
 ; GISEL-NEXT:    cmp x9, #0
-; GISEL-NEXT:    ldr x11, [sp, #296] ; 8-byte Folded Reload
-; GISEL-NEXT:    orr x10, x10, x16
-; GISEL-NEXT:    ldr x16, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldp x29, x30, [sp, #416] ; 16-byte Folded Reload
+; GISEL-NEXT:    orr x10, x10, x1
+; GISEL-NEXT:    ldr x1, [sp, #296] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x10, xzr, eq
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #1
-; GISEL-NEXT:    orr x11, x16, x11
-; GISEL-NEXT:    ldr x16, [sp, #272] ; 8-byte Folded Reload
+; GISEL-NEXT:    orr x11, x24, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x21, eq
+; GISEL-NEXT:    csel x11, xzr, x28, eq
 ; GISEL-NEXT:    cmp x9, #2
-; GISEL-NEXT:    orr x11, x28, x11
-; GISEL-NEXT:    ldp x29, x30, [sp, #400] ; 16-byte Folded Reload
+; GISEL-NEXT:    orr x11, x25, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
-; GISEL-NEXT:    ldr x11, [sp, #16] ; 8-byte Folded Reload
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x11, eq
+; GISEL-NEXT:    csel x11, xzr, x7, eq
 ; GISEL-NEXT:    cmp x9, #3
-; GISEL-NEXT:    orr x11, x24, x11
+; GISEL-NEXT:    orr x11, x1, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x22, eq
+; GISEL-NEXT:    csel x11, xzr, x16, eq
 ; GISEL-NEXT:    cmp x9, #4
-; GISEL-NEXT:    orr x11, x16, x11
-; GISEL-NEXT:    ldr x16, [sp, #232] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldr x16, [sp, #280] ; 8-byte Folded Reload
+; GISEL-NEXT:    orr x11, x2, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
+; GISEL-NEXT:    ldr x11, [sp, #240] ; 8-byte Folded Reload
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x2, eq
+; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #5
 ; GISEL-NEXT:    orr x11, x16, x11
-; GISEL-NEXT:    ldp x22, x21, [sp, #368] ; 16-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x11, x10, eq
+; GISEL-NEXT:    ldr x11, [sp, #32] ; 8-byte Folded Reload
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x0, eq
+; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #6
-; GISEL-NEXT:    orr x11, x17, x11
+; GISEL-NEXT:    orr x11, x15, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
-; GISEL-NEXT:    ldr x11, [sp, #264] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldr x11, [sp, #328] ; 8-byte Folded Reload
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #7
-; GISEL-NEXT:    orr x11, x14, x11
+; GISEL-NEXT:    orr x11, x27, x11
+; GISEL-NEXT:    ldp x28, x27, [sp, #336] ; 16-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x11, x10, eq
-; GISEL-NEXT:    ldr x11, [sp, #256] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldr x11, [sp, #320] ; 8-byte Folded Reload
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #8
 ; GISEL-NEXT:    orr x11, x13, x11
-; GISEL-NEXT:    ldr x13, [sp, #112] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldr x13, [sp, #144] ; 8-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x11, x10, eq
-; GISEL-NEXT:    ldr x11, [sp, #24] ; 8-byte Folded Reload
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x11, eq
+; GISEL-NEXT:    csel x11, xzr, x14, eq
 ; GISEL-NEXT:    cmp x9, #9
-; GISEL-NEXT:    orr x11, x15, x11
+; GISEL-NEXT:    orr x11, x3, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    tst x8, #0x3f
-; GISEL-NEXT:    csel x11, xzr, x12, eq
-; GISEL-NEXT:    ldr x12, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT:    csel x11, xzr, x0, eq
 ; GISEL-NEXT:    cmp x9, #10
 ; GISEL-NEXT:    orr x11, x12, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
-; GISEL-NEXT:    ldr x11, [sp, #192] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldr x11, [sp, #232] ; 8-byte Folded Reload
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    str x11, [x13]
-; GISEL-NEXT:    ldp x12, x11, [sp, #176] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x12, x11, [sp, #216] ; 16-byte Folded Reload
 ; GISEL-NEXT:    stp x11, x12, [x13, #8]
-; GISEL-NEXT:    csel x11, xzr, x23, eq
+; GISEL-NEXT:    csel x11, xzr, x19, eq
 ; GISEL-NEXT:    cmp x9, #11
-; GISEL-NEXT:    orr x11, x5, x11
-; GISEL-NEXT:    ldp x24, x23, [sp, #352] ; 16-byte Folded Reload
+; GISEL-NEXT:    orr x11, x23, x11
+; GISEL-NEXT:    ldp x24, x23, [sp, #368] ; 16-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x11, x10, eq
-; GISEL-NEXT:    ldr x11, [sp, #168] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldr x11, [sp, #208] ; 8-byte Folded Reload
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    str x11, [x13, #24]
-; GISEL-NEXT:    ldp x12, x11, [sp, #152] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x12, x11, [sp, #192] ; 16-byte Folded Reload
 ; GISEL-NEXT:    stp x11, x12, [x13, #32]
-; GISEL-NEXT:    csel x11, xzr, x7, eq
+; GISEL-NEXT:    csel x11, xzr, x17, eq
 ; GISEL-NEXT:    cmp x9, #12
-; GISEL-NEXT:    orr x11, x19, x11
+; GISEL-NEXT:    orr x11, x21, x11
 ; GISEL-NEXT:    csel x10, x11, x10, eq
-; GISEL-NEXT:    ldr x11, [sp, #144] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldr x11, [sp, #184] ; 8-byte Folded Reload
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    str x11, [x13, #48]
-; GISEL-NEXT:    ldp x12, x11, [sp, #128] ; 16-byte Folded Reload
+; GISEL-NEXT:    ldp x12, x11, [sp, #168] ; 16-byte Folded Reload
 ; GISEL-NEXT:    stp x11, x12, [x13, #56]
-; GISEL-NEXT:    csel x11, xzr, x27, eq
+; GISEL-NEXT:    ldr x11, [sp, #112] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldr x12, [sp, #136] ; 8-byte Folded Reload
+; GISEL-NEXT:    csel x11, xzr, x11, eq
 ; GISEL-NEXT:    cmp x9, #13
-; GISEL-NEXT:    orr x11, x6, x11
-; GISEL-NEXT:    ldp x28, x27, [sp, #320] ; 16-byte Folded Reload
+; GISEL-NEXT:    orr x11, x20, x11
+; GISEL-NEXT:    ldp x20, x19, [sp, #400] ; 16-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x11, x10, eq
-; GISEL-NEXT:    ldr x11, [sp, #120] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldr x11, [sp, #160] ; 8-byte Folded Reload
 ; GISEL-NEXT:    tst x8, #0x3f
 ; GISEL-NEXT:    str x11, [x13, #72]
-; GISEL-NEXT:    ldp x12, x11, [sp, #96] ; 16-byte Folded Reload
-; GISEL-NEXT:    stp x11, x12, [x13, #80]
-; GISEL-NEXT:    csel x11, xzr, x25, eq
+; GISEL-NEXT:    ldr x11, [sp, #152] ; 8-byte Folded Reload
+; GISEL-NEXT:    str x11, [x13, #80]
+; GISEL-NEXT:    csel x11, xzr, x22, eq
 ; GISEL-NEXT:    cmp x9, #14
-; GISEL-NEXT:    orr x11, x3, x11
+; GISEL-NEXT:    orr x11, x26, x11
+; GISEL-NEXT:    ldp x22, x21, [sp, #384] ; 16-byte Folded Reload
 ; GISEL-NEXT:    csel x10, x11, x10, eq
 ; GISEL-NEXT:    cmp x9, #15
-; GISEL-NEXT:    ldr x9, [sp, #288] ; 8-byte Folded Reload
-; GISEL-NEXT:    ldr x11, [sp, #88] ; 8-byte Folded Reload
-; GISEL-NEXT:    csel x9, x9, x10, eq
+; GISEL-NEXT:    ldr x9, [sp, #128] ; 8-byte Folded Reload
+; GISEL-NEXT:    ldp x26, x25, [sp, #352] ; 16-byte Folded Reload
+; GISEL-NEXT:    stp x12, x9, [x13, #88]
+; GISEL-NEXT:    csel x9, x6, x10, eq
 ; GISEL-NEXT:    cmp x8, #0
-; GISEL-NEXT:    ldr x8, [sp, #80] ; 8-byte Folded Reload
-; GISEL-NEXT:    stp x11, x8, [x13, #96]
-; GISEL-NEXT:    csel x8, x20, x9, eq
-; GISEL-NEXT:    stp x26, x8, [x13, #112]
-; GISEL-NEXT:    ldp x20, x19, [sp, #384] ; 16-byte Folded Reload
-; GISEL-NEXT:    ldp x26, x25, [sp, #336] ; 16-byte Folded Reload
-; GISEL-NEXT:    add sp, sp, #416
+; GISEL-NEXT:    ldr x8, [sp, #120] ; 8-byte Folded Reload
+; GISEL-NEXT:    stp x8, x5, [x13, #104]
+; GISEL-NEXT:    csel x8, x4, x9, eq
+; GISEL-NEXT:    str x8, [x13, #120]
+; GISEL-NEXT:    add sp, sp, #432
 ; GISEL-NEXT:    ret
 entry:
   %input_val = load i1024, ptr %input, align 128
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index cdde11042462b..19a766f0dee6d 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -267,7 +267,7 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
 ; CHECK-SD-NEXT:    and x9, x8, #0xfffffff0
 ; CHECK-SD-NEXT:    add x10, x2, #32
 ; CHECK-SD-NEXT:    add x11, x0, #16
-; CHECK-SD-NEXT:    mov x12, x9
+; CHECK-SD-NEXT:    and x12, x8, #0xfffffff0
 ; CHECK-SD-NEXT:  .LBB3_4: // %vector.body
 ; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-SD-NEXT:    ldp q1, q2, [x11, #-16]
@@ -313,7 +313,7 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
 ; CHECK-GI-NEXT:    and x10, x9, #0xfffffff0
 ; CHECK-GI-NEXT:    add x11, x2, #32
 ; CHECK-GI-NEXT:    add x12, x0, #16
-; CHECK-GI-NEXT:    mov x13, x10
+; CHECK-GI-NEXT:    and x13, x9, #0xfffffff0
 ; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-GI-NEXT:  .LBB3_3: // %vector.body
 ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
@@ -428,7 +428,7 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
 ; CHECK-SD-NEXT:    and x9, x8, #0xfffffff0
 ; CHECK-SD-NEXT:    add x10, x2, #32
 ; CHECK-SD-NEXT:    add x11, x0, #16
-; CHECK-SD-NEXT:    mov x12, x9
+; CHECK-SD-NEXT:    and x12, x8, #0xfffffff0
 ; CHECK-SD-NEXT:  .LBB4_4: // %vector.body
 ; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-SD-NEXT:    ldp q1, q2, [x11, #-16]
@@ -472,7 +472,7 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
 ; CHECK-GI-NEXT:    and x8, x9, #0xfffffff0
 ; CHECK-GI-NEXT:    add x10, x2, #32
 ; CHECK-GI-NEXT:    add x11, x0, #16
-; CHECK-GI-NEXT:    mov x12, x8
+; CHECK-GI-NEXT:    and x12, x9, #0xfffffff0
 ; CHECK-GI-NEXT:  .LBB4_3: // %vector.body
 ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-GI-NEXT:    and w13, w1, #0xffff
@@ -596,7 +596,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
 ; CHECK-SD-NEXT:    and x11, x10, #0xfffffff0
 ; CHECK-SD-NEXT:    fmov s2, w9
 ; CHECK-SD-NEXT:    add x8, x0, #8
-; CHECK-SD-NEXT:    mov x12, x11
+; CHECK-SD-NEXT:    and x12, x10, #0xfffffff0
 ; CHECK-SD-NEXT:  .LBB5_5: // %vector.body
 ; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-SD-NEXT:    ldp d3, d4, [x8, #-8]
@@ -646,10 +646,10 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
 ; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
 ; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
 ; CHECK-GI-NEXT:    add x10, x0, #8
+; CHECK-GI-NEXT:    and x11, x8, #0xfffffff0
 ; CHECK-GI-NEXT:    sbfx w9, w9, #8, #8
 ; CHECK-GI-NEXT:    dup v2.8h, w9
 ; CHECK-GI-NEXT:    and x9, x8, #0xfffffff0
-; CHECK-GI-NEXT:    mov x11, x9
 ; CHECK-GI-NEXT:  .LBB5_5: // %vector.body
 ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-GI-NEXT:    ldp d3, d4, [x10, #-8]
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
index 4c8e589391c3a..c23e4e182f2ef 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
@@ -17,7 +17,7 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
 ; CHECK-NEXT:    and x9, x8, #0xfffffff0
 ; CHECK-NEXT:    add x10, x1, #16
 ; CHECK-NEXT:    add x11, x0, #16
-; CHECK-NEXT:    mov x12, x9
+; CHECK-NEXT:    and x12, x8, #0xfffffff0
 ; CHECK-NEXT:  .LBB0_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldp q1, q4, [x10, #-16]
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
index f6bbdf5d95d87..1770bb9794f09 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
@@ -14,7 +14,6 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
 ; CHECK-NEXT:    mov x9, xzr
 ; CHECK-NEXT:    and x12, x10, #0xfffffff0
 ; CHECK-NEXT:    add x13, x1, #32
-; CHECK-NEXT:    add x14, x2, #16
 ; CHECK-NEXT:    b .LBB0_3
 ; CHECK-NEXT:  .LBB0_2: // %for.cond1.for.cond.cleanup3_crit_edge.us
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
@@ -27,52 +26,52 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
 ; CHECK-NEXT:    // =>This Loop Header: Depth=1
 ; CHECK-NEXT:    // Child Loop BB0_6 Depth 2
 ; CHECK-NEXT:    // Child Loop BB0_9 Depth 2
-; CHECK-NEXT:    ldrsh w15, [x2, x9, lsl #1]
+; CHECK-NEXT:    ldrsh w14, [x2, x9, lsl #1]
 ; CHECK-NEXT:    cmp w0, #16
 ; CHECK-NEXT:    b.hs .LBB0_5
 ; CHECK-NEXT:  // %bb.4: // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    mov x18, xzr
+; CHECK-NEXT:    mov x17, xzr
 ; CHECK-NEXT:    b .LBB0_8
 ; CHECK-NEXT:  .LBB0_5: // %vector.ph
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    dup v0.8h, w15
-; CHECK-NEXT:    mov x16, x14
-; CHECK-NEXT:    mov x17, x13
-; CHECK-NEXT:    mov x18, x12
+; CHECK-NEXT:    dup v0.8h, w14
+; CHECK-NEXT:    add x15, x2, #16
+; CHECK-NEXT:    mov x16, x13
+; CHECK-NEXT:    and x17, x10, #0xfffffff0
 ; CHECK-NEXT:  .LBB0_6: // %vector.body
 ; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldp q1, q4, [x16, #-16]
-; CHECK-NEXT:    subs x18, x18, #16
-; CHECK-NEXT:    ldp q3, q2, [x17, #-32]
-; CHECK-NEXT:    add x16, x16, #32
-; CHECK-NEXT:    ldp q6, q5, [x17]
+; CHECK-NEXT:    ldp q1, q4, [x15, #-16]
+; CHECK-NEXT:    subs x17, x17, #16
+; CHECK-NEXT:    ldp q3, q2, [x16, #-32]
+; CHECK-NEXT:    add x15, x15, #32
+; CHECK-NEXT:    ldp q6, q5, [x16]
 ; CHECK-NEXT:    smlal2 v2.4s, v0.8h, v1.8h
 ; CHECK-NEXT:    smlal v3.4s, v0.4h, v1.4h
 ; CHECK-NEXT:    smlal2 v5.4s, v0.8h, v4.8h
 ; CHECK-NEXT:    smlal v6.4s, v0.4h, v4.4h
-; CHECK-NEXT:    stp q3, q2, [x17, #-32]
-; CHECK-NEXT:    stp q6, q5, [x17], #64
+; CHECK-NEXT:    stp q3, q2, [x16, #-32]
+; CHECK-NEXT:    stp q6, q5, [x16], #64
 ; CHECK-NEXT:    b.ne .LBB0_6
 ; CHECK-NEXT:  // %bb.7: // %middle.block
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    cmp x12, x10
-; CHECK-NEXT:    mov x18, x12
+; CHECK-NEXT:    and x17, x10, #0xfffffff0
 ; CHECK-NEXT:    b.eq .LBB0_2
 ; CHECK-NEXT:  .LBB0_8: // %for.body4.us.preheader
 ; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT:    add x16, x18, x8
-; CHECK-NEXT:    add x17, x2, x18, lsl #1
-; CHECK-NEXT:    sub x18, x10, x18
-; CHECK-NEXT:    add x16, x1, x16, lsl #2
+; CHECK-NEXT:    add x15, x17, x8
+; CHECK-NEXT:    add x16, x2, x17, lsl #1
+; CHECK-NEXT:    sub x17, x10, x17
+; CHECK-NEXT:    add x15, x1, x15, lsl #2
 ; CHECK-NEXT:  .LBB0_9: // %for.body4.us
 ; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrsh w3, [x17], #2
-; CHECK-NEXT:    ldr w4, [x16]
-; CHECK-NEXT:    subs x18, x18, #1
-; CHECK-NEXT:    madd w3, w3, w15, w4
-; CHECK-NEXT:    str w3, [x16], #4
+; CHECK-NEXT:    ldrsh w18, [x16], #2
+; CHECK-NEXT:    ldr w3, [x15]
+; CHECK-NEXT:    subs x17, x17, #1
+; CHECK-NEXT:    madd w18, w18, w14, w3
+; CHECK-NEXT:    str w18, [x15], #4
 ; CHECK-NEXT:    b.ne .LBB0_9
 ; CHECK-NEXT:    b .LBB0_2
 ; CHECK-NEXT:  .LBB0_10: // %for.cond.cleanup
diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
index 3caac1d13495d..74b0e69d1b05b 100644
--- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -278,9 +278,9 @@ define i64 @test_and_4(i64 %x, i64 %y) {
 ; CHECK-GI-NEXT:    .cfi_offset w19, -8
 ; CHECK-GI-NEXT:    .cfi_offset w20, -16
 ; CHECK-GI-NEXT:    .cfi_offset w30, -32
-; CHECK-GI-NEXT:    and x20, x0, #0x3
 ; CHECK-GI-NEXT:    mov x19, x0
-; CHECK-GI-NEXT:    mov x0, x20
+; CHECK-GI-NEXT:    and x20, x0, #0x3
+; CHECK-GI-NEXT:    and x0, x0, #0x3
 ; CHECK-GI-NEXT:    bl callee
 ; CHECK-GI-NEXT:    tst x19, #0x3
 ; CHECK-GI-NEXT:    csel x0, x20, x0, eq
diff --git a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
index e0f21550cd8e3..58c01db032a8c 100644
--- a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
+++ b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
@@ -7,20 +7,16 @@
 define void @foo(i64 %v1, i64 %v2, ptr %ptr) {
 ; CHECK-LABEL: foo:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    add x3, x0, x1
-; CHECK-NEXT:    str x3, [sp, #8] // 8-byte Folded Spill
 ; CHECK-NEXT:    str x3, [x2, #8]
 ; CHECK-NEXT:    ldr x3, [x2, #16]
 ; CHECK-NEXT:    add x3, x0, x3
 ; CHECK-NEXT:    sub x3, x3, x1
 ; CHECK-NEXT:    str x3, [x2, #16]
-; CHECK-NEXT:    ldr x3, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT:    add x3, x0, x1
 ; CHECK-NEXT:    str x3, [x2, #24]
 ; CHECK-NEXT:    str x0, [x2, #32]
 ; CHECK-NEXT:    str x1, [x2, #40]
-; CHECK-NEXT:    add sp, sp, #16
 ; CHECK-NEXT:    ret
   %v3 = add i64 %v1, %v2
   %p1 = getelementptr i64, ptr %ptr, i64 1
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 5fc996ad921ff..0f629971b5844 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -23,7 +23,7 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:    add x13, x1, #16
 ; CHECK-NEXT:    add x8, x1, x10, lsl #2
 ; CHECK-NEXT:    add x9, x0, x10
-; CHECK-NEXT:    mov x14, x10
+; CHECK-NEXT:    and x14, x11, #0x1fffffff8
 ; CHECK-NEXT:  .LBB0_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldp q1, q2, [x13, #-16]
@@ -194,9 +194,9 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:    mov w8, #1132396544 // =0x437f0000
 ; CHECK-NEXT:    and x10, x11, #0x1fffffffc
 ; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:    and x12, x11, #0x1fffffffc
 ; CHECK-NEXT:    add x8, x1, x10, lsl #3
 ; CHECK-NEXT:    add x9, x0, x10, lsl #1
-; CHECK-NEXT:    mov x12, x10
 ; CHECK-NEXT:  .LBB1_9: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld2 { v1.4s, v2.4s }, [x1], #32
@@ -341,7 +341,7 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:    dup v0.4s, w8
 ; CHECK-NEXT:    ldr q1, [x12, :lo12:.LCPI2_0]
 ; CHECK-NEXT:    add x9, x10, x10, lsl #1
-; CHECK-NEXT:    mov x12, x10
+; CHECK-NEXT:    and x12, x11, #0x1fffffffc
 ; CHECK-NEXT:    add x8, x1, x9, lsl #2
 ; CHECK-NEXT:    add x9, x0, x9
 ; CHECK-NEXT:  .LBB2_4: // %vector.body
@@ -597,7 +597,7 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:    ldr q1, [x12, :lo12:.LCPI3_0]
 ; CHECK-NEXT:    add x8, x1, x10, lsl #4
 ; CHECK-NEXT:    add x9, x0, x10, lsl #2
-; CHECK-NEXT:    mov x12, x10
+; CHECK-NEXT:    and x12, x11, #0x1fffffffc
 ; CHECK-NEXT:  .LBB3_9: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64
diff --git a/llvm/test/CodeGen/ARM/combine-movc-sub.ll b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
index ca5d089443542..8ca4c4320987e 100644
--- a/llvm/test/CodeGen/ARM/combine-movc-sub.ll
+++ b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
@@ -27,11 +27,11 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
 ; CHECK-NEXT:    sub sp, #4
-; CHECK-NEXT:    sub.w r7, r2, #32
-; CHECK-NEXT:    mov r8, r0
+; CHECK-NEXT:    sub.w r8, r2, #32
+; CHECK-NEXT:    mov r6, r0
 ; CHECK-NEXT:    movs r0, #1
 ; CHECK-NEXT:    mov r4, r2
-; CHECK-NEXT:    add.w r6, r0, r7, lsr #5
+; CHECK-NEXT:    add.w r7, r0, r8, lsr #5
 ; CHECK-NEXT:    mov r5, r1
 ; CHECK-NEXT:    mov.w r9, #0
 ; CHECK-NEXT:    b .LBB0_2
@@ -44,16 +44,16 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
 ; CHECK-NEXT:    mov r2, r4
 ; CHECK-NEXT:    cmp r4, #31
 ; CHECK-NEXT:    ldr r0, [r1, #16]
-; CHECK-NEXT:    add.w r0, r0, r6, lsl #2
+; CHECK-NEXT:    add.w r0, r0, r7, lsl #2
 ; CHECK-NEXT:    ldr r0, [r0, #40]
 ; CHECK-NEXT:    it hi
-; CHECK-NEXT:    andhi r2, r7, #31
+; CHECK-NEXT:    andhi r2, r8, #31
 ; CHECK-NEXT:    lsrs r0, r2
 ; CHECK-NEXT:    lsls r0, r0, #31
 ; CHECK-NEXT:    beq .LBB0_1
 ; CHECK-NEXT:  @ %bb.3: @ %if.then
 ; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT:    mov r0, r8
+; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:    bl foo
 ; CHECK-NEXT:    str.w r9, [r5, #4]
 ; CHECK-NEXT:    b .LBB0_1
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index e761d3a9a75c3..33b89a405d8e3 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -39,119 +39,118 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    slli a2, a2, 1
 ; NOREMAT-NEXT:    sub sp, sp, a2
 ; NOREMAT-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb
-; NOREMAT-NEXT:    mv a7, a0
-; NOREMAT-NEXT:    li a0, 32
-; NOREMAT-NEXT:    addi a5, a7, 512
-; NOREMAT-NEXT:    addi a4, a7, 1024
-; NOREMAT-NEXT:    addi a6, a7, 1536
-; NOREMAT-NEXT:    li t4, 1
-; NOREMAT-NEXT:    li a2, 5
+; NOREMAT-NEXT:    li a7, 32
+; NOREMAT-NEXT:    addi s10, a0, 512
+; NOREMAT-NEXT:    addi a4, a0, 1024
+; NOREMAT-NEXT:    addi a6, a0, 1536
+; NOREMAT-NEXT:    li t0, 1
+; NOREMAT-NEXT:    li a3, 5
 ; NOREMAT-NEXT:    li t1, 3
-; NOREMAT-NEXT:    li t0, 7
-; NOREMAT-NEXT:    lui t5, 1
+; NOREMAT-NEXT:    li a2, 7
+; NOREMAT-NEXT:    lui t2, 1
 ; NOREMAT-NEXT:    li s4, 9
 ; NOREMAT-NEXT:    li s6, 11
 ; NOREMAT-NEXT:    li s9, 13
 ; NOREMAT-NEXT:    li ra, 15
-; NOREMAT-NEXT:    lui t2, 2
+; NOREMAT-NEXT:    lui a5, 2
 ; NOREMAT-NEXT:    lui s1, 3
 ; NOREMAT-NEXT:    lui t3, 4
 ; NOREMAT-NEXT:    lui s0, 5
 ; NOREMAT-NEXT:    lui s3, 6
 ; NOREMAT-NEXT:    lui s7, 7
-; NOREMAT-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; NOREMAT-NEXT:    slli t4, t4, 11
-; NOREMAT-NEXT:    sd t4, 512(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    slli a3, a2, 9
-; NOREMAT-NEXT:    sd a3, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    vsetvli zero, a7, e32, m2, ta, ma
+; NOREMAT-NEXT:    slli t0, t0, 11
+; NOREMAT-NEXT:    sd t0, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    slli t4, a3, 9
+; NOREMAT-NEXT:    sd t4, 504(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    slli t6, t1, 10
-; NOREMAT-NEXT:    slli s2, t0, 9
-; NOREMAT-NEXT:    add a0, a7, t5
+; NOREMAT-NEXT:    slli s2, a2, 9
+; NOREMAT-NEXT:    add a7, a0, t2
 ; NOREMAT-NEXT:    lui s11, 1
 ; NOREMAT-NEXT:    slli s4, s4, 9
-; NOREMAT-NEXT:    slli s5, a2, 10
+; NOREMAT-NEXT:    slli s5, a3, 10
 ; NOREMAT-NEXT:    slli s6, s6, 9
 ; NOREMAT-NEXT:    slli s8, t1, 11
-; NOREMAT-NEXT:    vle32.v v8, (a5)
+; NOREMAT-NEXT:    vle32.v v8, (s10)
 ; NOREMAT-NEXT:    slli s9, s9, 9
 ; NOREMAT-NEXT:    li t5, 13
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    slli s10, t0, 10
+; NOREMAT-NEXT:    slli s10, a2, 10
 ; NOREMAT-NEXT:    vle32.v v0, (a6)
 ; NOREMAT-NEXT:    vle32.v v12, (a6)
 ; NOREMAT-NEXT:    slli ra, ra, 9
-; NOREMAT-NEXT:    vle32.v v4, (a0)
-; NOREMAT-NEXT:    vle32.v v20, (a0)
-; NOREMAT-NEXT:    add a4, a7, t2
+; NOREMAT-NEXT:    vle32.v v4, (a7)
+; NOREMAT-NEXT:    vle32.v v20, (a7)
+; NOREMAT-NEXT:    add a4, a0, a5
 ; NOREMAT-NEXT:    vle32.v v6, (a4)
 ; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    add a4, a7, s1
+; NOREMAT-NEXT:    add a4, a0, s1
 ; NOREMAT-NEXT:    vle32.v v28, (a4)
 ; NOREMAT-NEXT:    vle32.v v26, (a4)
-; NOREMAT-NEXT:    add a4, a7, t3
+; NOREMAT-NEXT:    add a4, a0, t3
 ; NOREMAT-NEXT:    vle32.v v24, (a4)
 ; NOREMAT-NEXT:    vle32.v v22, (a4)
-; NOREMAT-NEXT:    add a4, a7, s0
-; NOREMAT-NEXT:    vle32.v v14, (a7)
+; NOREMAT-NEXT:    add a4, a0, s0
+; NOREMAT-NEXT:    vle32.v v14, (a0)
 ; NOREMAT-NEXT:    vle32.v v18, (a4)
 ; NOREMAT-NEXT:    vle32.v v16, (a4)
-; NOREMAT-NEXT:    add a4, a7, s3
+; NOREMAT-NEXT:    add a4, a0, s3
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v14, v8
 ; NOREMAT-NEXT:    vle32.v v14, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v10
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    addi a0, sp, 640
-; NOREMAT-NEXT:    vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a7, t4
+; NOREMAT-NEXT:    addi a4, sp, 640
+; NOREMAT-NEXT:    vs2r.v v8, (a4) # vscale x 16-byte Folded Spill
+; NOREMAT-NEXT:    add a4, a0, t0
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    add a4, a7, a3
+; NOREMAT-NEXT:    add a4, a0, t4
 ; NOREMAT-NEXT:    vle32.v v0, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v10
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    add a4, a7, t6
+; NOREMAT-NEXT:    add a4, a0, t6
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v0
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
-; NOREMAT-NEXT:    add a4, a7, s2
+; NOREMAT-NEXT:    add a4, a0, s2
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v10, v12
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s7
+; NOREMAT-NEXT:    add a4, a0, s7
 ; NOREMAT-NEXT:    vle32.v v0, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v8
 ; NOREMAT-NEXT:    vle32.v v10, (a4)
-; NOREMAT-NEXT:    add a4, a7, s4
+; NOREMAT-NEXT:    add a4, a0, s4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s5
+; NOREMAT-NEXT:    add a4, a0, s5
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v20, v8
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    add a4, a7, s6
+; NOREMAT-NEXT:    add a4, a0, s6
 ; NOREMAT-NEXT:    vle32.v v20, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s8
+; NOREMAT-NEXT:    add a4, a0, s8
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    add a4, a7, s9
+; NOREMAT-NEXT:    add a4, a0, s9
 ; NOREMAT-NEXT:    vle32.v v20, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
-; NOREMAT-NEXT:    add a4, a7, s10
+; NOREMAT-NEXT:    add a4, a0, s10
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v20
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
-; NOREMAT-NEXT:    add a4, a7, ra
+; NOREMAT-NEXT:    add a4, a0, ra
 ; NOREMAT-NEXT:    vle32.v v2, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v4
 ; NOREMAT-NEXT:    lui t4, 8
-; NOREMAT-NEXT:    add a5, a7, t4
+; NOREMAT-NEXT:    add a5, a0, t4
 ; NOREMAT-NEXT:    vle32.v v20, (a5)
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v8, v2
@@ -159,14 +158,14 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    slli a4, a4, 9
 ; NOREMAT-NEXT:    li s1, 17
 ; NOREMAT-NEXT:    sd a4, 624(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    add a4, a0, a4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    vle32.v v4, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v6
 ; NOREMAT-NEXT:    li a5, 9
 ; NOREMAT-NEXT:    slli a4, a5, 10
 ; NOREMAT-NEXT:    sd a4, 616(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    add a4, a0, a4
 ; NOREMAT-NEXT:    vle32.v v12, (a4)
 ; NOREMAT-NEXT:    vle32.v v6, (a4)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
@@ -174,256 +173,257 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    slli a4, a4, 9
 ; NOREMAT-NEXT:    li t2, 19
 ; NOREMAT-NEXT:    sd a4, 608(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a4, a7, a4
+; NOREMAT-NEXT:    add a4, a0, a4
 ; NOREMAT-NEXT:    vle32.v v8, (a4)
 ; NOREMAT-NEXT:    vle32.v v30, (a4)
-; NOREMAT-NEXT:    slli a3, a2, 11
+; NOREMAT-NEXT:    slli a3, a3, 11
 ; NOREMAT-NEXT:    sd a3, 600(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
 ; NOREMAT-NEXT:    li s7, 21
 ; NOREMAT-NEXT:    slli a3, s7, 9
 ; NOREMAT-NEXT:    sd a3, 592(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
 ; NOREMAT-NEXT:    li a6, 11
 ; NOREMAT-NEXT:    slli a3, a6, 10
 ; NOREMAT-NEXT:    sd a3, 584(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
 ; NOREMAT-NEXT:    li s3, 23
 ; NOREMAT-NEXT:    slli a3, s3, 9
 ; NOREMAT-NEXT:    sd a3, 576(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
 ; NOREMAT-NEXT:    li s0, 25
 ; NOREMAT-NEXT:    slli a3, s0, 9
 ; NOREMAT-NEXT:    sd a3, 568(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v12, (a3)
 ; NOREMAT-NEXT:    vle32.v v6, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
 ; NOREMAT-NEXT:    slli a3, t5, 10
 ; NOREMAT-NEXT:    sd a3, 560(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v8, (a3)
 ; NOREMAT-NEXT:    vle32.v v30, (a3)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v28
 ; NOREMAT-NEXT:    li t3, 27
 ; NOREMAT-NEXT:    slli a3, t3, 9
 ; NOREMAT-NEXT:    sd a3, 552(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a3, a7, a3
+; NOREMAT-NEXT:    add a3, a0, a3
 ; NOREMAT-NEXT:    vle32.v v28, (a3)
 ; NOREMAT-NEXT:    vle32.v v4, (a3)
-; NOREMAT-NEXT:    slli a2, t0, 11
+; NOREMAT-NEXT:    slli a2, a2, 11
 ; NOREMAT-NEXT:    sd a2, 544(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
 ; NOREMAT-NEXT:    li t0, 29
 ; NOREMAT-NEXT:    slli a2, t0, 9
 ; NOREMAT-NEXT:    sd a2, 536(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v28
-; NOREMAT-NEXT:    li a3, 15
-; NOREMAT-NEXT:    slli a2, a3, 10
+; NOREMAT-NEXT:    li a7, 15
+; NOREMAT-NEXT:    slli a2, a7, 10
 ; NOREMAT-NEXT:    sd a2, 528(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v12
 ; NOREMAT-NEXT:    li t1, 31
 ; NOREMAT-NEXT:    slli a2, t1, 9
 ; NOREMAT-NEXT:    sd a2, 520(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v8
-; NOREMAT-NEXT:    lui a4, 4
-; NOREMAT-NEXT:    addi a0, a4, 512
-; NOREMAT-NEXT:    sd a0, 496(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a0, a7, a0
-; NOREMAT-NEXT:    vle32.v v8, (a0)
-; NOREMAT-NEXT:    vle32.v v26, (a0)
+; NOREMAT-NEXT:    lui a3, 4
+; NOREMAT-NEXT:    addi a2, a3, 512
+; NOREMAT-NEXT:    sd a2, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a2, a0, a2
+; NOREMAT-NEXT:    vle32.v v8, (a2)
+; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v28
 ; NOREMAT-NEXT:    slli a2, s1, 10
 ; NOREMAT-NEXT:    sd a2, 488(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT:    addi a2, a4, 1536
+; NOREMAT-NEXT:    addi a2, a3, 1536
 ; NOREMAT-NEXT:    sd a2, 480(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    lui a4, 4
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    slli a2, a5, 11
 ; NOREMAT-NEXT:    sd a2, 472(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v24
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v8
 ; NOREMAT-NEXT:    lui a5, 5
 ; NOREMAT-NEXT:    addi a2, a5, -1536
 ; NOREMAT-NEXT:    sd a2, 464(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v28
 ; NOREMAT-NEXT:    slli a2, t2, 10
 ; NOREMAT-NEXT:    sd a2, 456(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    li t2, 19
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    li a3, 19
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v12
 ; NOREMAT-NEXT:    addi a2, a5, -512
 ; NOREMAT-NEXT:    sd a2, 448(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v24
 ; NOREMAT-NEXT:    addi a2, a5, 512
 ; NOREMAT-NEXT:    sd a2, 440(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    slli a2, s7, 10
 ; NOREMAT-NEXT:    sd a2, 432(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v8
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v22, v26
 ; NOREMAT-NEXT:    addi a2, a5, 1536
 ; NOREMAT-NEXT:    sd a2, 424(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    slli a2, a6, 11
 ; NOREMAT-NEXT:    sd a2, 416(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v12
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v18
 ; NOREMAT-NEXT:    lui a6, 6
 ; NOREMAT-NEXT:    addi a2, a6, -1536
 ; NOREMAT-NEXT:    sd a2, 408(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    slli a2, s3, 10
 ; NOREMAT-NEXT:    sd a2, 400(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v16, v24
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v8
 ; NOREMAT-NEXT:    addi a2, a6, -512
 ; NOREMAT-NEXT:    sd a2, 392(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v22
 ; NOREMAT-NEXT:    addi a2, a6, 512
 ; NOREMAT-NEXT:    sd a2, 384(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    slli a2, s0, 10
 ; NOREMAT-NEXT:    sd a2, 376(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v2, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v18
 ; NOREMAT-NEXT:    addi a2, a6, 1536
 ; NOREMAT-NEXT:    sd a2, 368(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
 ; NOREMAT-NEXT:    slli a2, t5, 11
 ; NOREMAT-NEXT:    sd a2, 360(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v16
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v8
 ; NOREMAT-NEXT:    lui s0, 7
 ; NOREMAT-NEXT:    addi a2, s0, -1536
 ; NOREMAT-NEXT:    sd a2, 352(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    slli a2, t3, 10
 ; NOREMAT-NEXT:    sd a2, 344(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v14
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v14, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    addi a0, sp, 640
-; NOREMAT-NEXT:    vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; NOREMAT-NEXT:    addi a2, sp, 640
+; NOREMAT-NEXT:    vl2r.v v12, (a2) # vscale x 16-byte Folded Reload
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v22
 ; NOREMAT-NEXT:    addi a2, s0, -512
 ; NOREMAT-NEXT:    sd a2, 336(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v12, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v4, v26
 ; NOREMAT-NEXT:    addi a2, s0, 512
 ; NOREMAT-NEXT:    sd a2, 328(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    lui t3, 7
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v26, (a2)
 ; NOREMAT-NEXT:    vle32.v v4, (a2)
 ; NOREMAT-NEXT:    slli a2, t0, 10
 ; NOREMAT-NEXT:    sd a2, 320(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v2, v18
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v18, (a2)
 ; NOREMAT-NEXT:    vle32.v v2, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v28, v16
 ; NOREMAT-NEXT:    addi a2, t3, 1536
 ; NOREMAT-NEXT:    sd a2, 312(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v16, (a2)
 ; NOREMAT-NEXT:    vle32.v v28, (a2)
-; NOREMAT-NEXT:    slli a2, a3, 11
+; NOREMAT-NEXT:    slli a2, a7, 11
 ; NOREMAT-NEXT:    sd a2, 304(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v6, v8
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v8, (a2)
 ; NOREMAT-NEXT:    vle32.v v6, (a2)
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v24, v14
 ; NOREMAT-NEXT:    addi a2, t4, -1536
 ; NOREMAT-NEXT:    sd a2, 296(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v14, (a2)
 ; NOREMAT-NEXT:    vle32.v v24, (a2)
 ; NOREMAT-NEXT:    slli a2, t1, 10
 ; NOREMAT-NEXT:    sd a2, 288(sp) # 8-byte Folded Spill
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v30, v22
-; NOREMAT-NEXT:    add a2, a7, a2
+; NOREMAT-NEXT:    add a2, a0, a2
 ; NOREMAT-NEXT:    vle32.v v22, (a2)
 ; NOREMAT-NEXT:    vle32.v v30, (a2)
-; NOREMAT-NEXT:    addi a0, t4, -512
-; NOREMAT-NEXT:    sd a0, 280(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT:    add a0, a7, a0
+; NOREMAT-NEXT:    addi a2, t4, -512
+; NOREMAT-NEXT:    sd a2, 280(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT:    add a0, a0, a2
 ; NOREMAT-NEXT:    sf.vc.vv 3, 0, v12, v0
 ; NOREMAT-NEXT:    vle32.v v12, (a0)
 ; NOREMAT-NEXT:    vle32.v v0, (a0)
@@ -476,7 +476,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
 ; NOREMAT-NEXT:    addi s11, a0, 512
 ; NOREMAT-NEXT:    addi s7, a0, 1024
 ; NOREMAT-NEXT:    addi s3, a0, 1536
-; NOREMAT-NEXT:    slli s1, t2, 11
+; NOREMAT-NEXT:    slli s1, a3, 11
 ; NOREMAT-NEXT:    lui a0, 10
 ; NOREMAT-NEXT:    addi t2, a0, -1536
 ; NOREMAT-NEXT:    addi a7, a0, -1024
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
index 6f986ce28381b..c418038b751d7 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -541,11 +541,11 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(ptr nocaptur
 ; CHECK-NEXT:    cbz r2, .LBB7_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
 ; CHECK-NEXT:    adds r3, r2, #7
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    bic r3, r3, #7
 ; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    bic r3, r3, #7
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    subs r3, #8
-; CHECK-NEXT:    vmov q3, q1
+; CHECK-NEXT:    vmov.i32 q3, #0x0
 ; CHECK-NEXT:    add.w r12, r4, r3, lsr #3
 ; CHECK-NEXT:    mov r3, r0
 ; CHECK-NEXT:    mov r4, r1
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
index 40207090fda66..fe06601e77bb3 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
@@ -16,39 +16,40 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture
 ; CHECK-NEXT:    sub sp, #64
 ; CHECK-NEXT:    ldrsh.w r7, [r2]
 ; CHECK-NEXT:    cmp r7, #1
-; CHECK-NEXT:    blt.w .LBB0_6
+; CHECK-NEXT:    blt .LBB0_6
 ; CHECK-NEXT:  @ %bb.2: @ %for.cond3.preheader.us.preheader
-; CHECK-NEXT:    movs r2, #252
 ; CHECK-NEXT:    ldr r4, [sp, #152]
+; CHECK-NEXT:    movs r2, #252
 ; CHECK-NEXT:    and.w r6, r2, r3, lsr #3
 ; CHECK-NEXT:    movs r2, #120
 ; CHECK-NEXT:    and.w r5, r2, r3, lsr #9
 ; CHECK-NEXT:    lsls r3, r3, #3
-; CHECK-NEXT:    uxtb r3, r3
 ; CHECK-NEXT:    muls r6, r4, r6
+; CHECK-NEXT:    uxtb r3, r3
 ; CHECK-NEXT:    rsb.w r2, r4, #256
-; CHECK-NEXT:    vmov.i16 q2, #0xfc
+; CHECK-NEXT:    vmov.i16 q1, #0xfc
+; CHECK-NEXT:    vdup.16 q0, r6
 ; CHECK-NEXT:    mul lr, r5, r4
-; CHECK-NEXT:    vdup.16 q4, r6
 ; CHECK-NEXT:    mov.w r6, #2016
-; CHECK-NEXT:    vmov.i16 q6, #0xf8
+; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    mul r5, r3, r4
 ; CHECK-NEXT:    adds r3, r7, #7
+; CHECK-NEXT:    vdup.16 q0, r6
 ; CHECK-NEXT:    bic r3, r3, #7
-; CHECK-NEXT:    vdup.16 q3, lr
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vdup.16 q0, r5
+; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vdup.16 q0, lr
 ; CHECK-NEXT:    subs r3, #8
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    vdup.16 q0, r5
-; CHECK-NEXT:    lsls r1, r1, #1
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    add.w r3, r4, r3, lsr #3
-; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmov.i16 q0, #0xf800
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT:    lsls r1, r1, #1
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    vdup.16 q5, r6
-; CHECK-NEXT:    vmov.i16 q7, #0x78
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i16 q4, #0xf8
 ; CHECK-NEXT:  .LBB0_3: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB0_4 Depth 2
@@ -59,37 +60,31 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture
 ; CHECK-NEXT:    @ Parent Loop BB0_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vctp.16 r6
-; CHECK-NEXT:    subs r6, #8
+; CHECK-NEXT:    vmov.i16 q5, #0xf800
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrht.u16 q0, [r5]
-; CHECK-NEXT:    vshr.u16 q1, q0, #3
-; CHECK-NEXT:    vand q1, q1, q2
-; CHECK-NEXT:    vmov q2, q4
-; CHECK-NEXT:    vmla.i16 q2, q1, r2
-; CHECK-NEXT:    vshr.u16 q1, q2, #5
-; CHECK-NEXT:    vshl.i16 q2, q0, #3
-; CHECK-NEXT:    vand q3, q1, q5
-; CHECK-NEXT:    vmov q1, q7
-; CHECK-NEXT:    vand q2, q2, q6
-; CHECK-NEXT:    vmov q7, q6
-; CHECK-NEXT:    vmov q6, q5
-; CHECK-NEXT:    vmov q5, q4
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    subs r6, #8
+; CHECK-NEXT:    vshr.u16 q3, q0, #3
+; CHECK-NEXT:    vand q3, q3, q1
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmla.i16 q1, q3, r2
+; CHECK-NEXT:    vshl.i16 q3, q0, #3
+; CHECK-NEXT:    vand q3, q3, q4
+; CHECK-NEXT:    vmov q4, q6
+; CHECK-NEXT:    vshr.u16 q1, q1, #5
+; CHECK-NEXT:    vmla.i16 q4, q3, r2
+; CHECK-NEXT:    vshr.u16 q3, q4, #11
+; CHECK-NEXT:    vand q1, q1, q7
+; CHECK-NEXT:    vorr q1, q1, q3
 ; CHECK-NEXT:    vshr.u16 q0, q0, #9
-; CHECK-NEXT:    vmla.i16 q4, q2, r2
-; CHECK-NEXT:    vshr.u16 q2, q4, #11
-; CHECK-NEXT:    vmov q4, q5
-; CHECK-NEXT:    vmov q5, q6
-; CHECK-NEXT:    vmov q6, q7
-; CHECK-NEXT:    vmov q7, q1
-; CHECK-NEXT:    vorr q1, q3, q2
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vand q0, q0, q7
-; CHECK-NEXT:    vmla.i16 q2, q0, r2
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vand q0, q2, q0
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i16 q3, #0x78
+; CHECK-NEXT:    vmov.i16 q4, #0xf8
+; CHECK-NEXT:    vand q0, q0, q3
+; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vmla.i16 q3, q0, r2
+; CHECK-NEXT:    vand q0, q3, q5
 ; CHECK-NEXT:    vorr q0, q1, q0
+; CHECK-NEXT:    vmov.i16 q1, #0xfc
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrht.16 q0, [r5], #16
 ; CHECK-NEXT:    le lr, .LBB0_4
@@ -190,7 +185,7 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    ldrsh.w r12, [r2, #2]
 ; CHECK-NEXT:    cmp.w r12, #1
-; CHECK-NEXT:    blt.w .LBB1_7
+; CHECK-NEXT:    blt .LBB1_7
 ; CHECK-NEXT:  @ %bb.1: @ %for.cond3.preheader.lr.ph
 ; CHECK-NEXT:    ldrsh.w r2, [r2]
 ; CHECK-NEXT:    cmp r2, #1
@@ -200,71 +195,70 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc
 ; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    sub sp, #80
-; CHECK-NEXT:    ldr r7, [sp, #168]
+; CHECK-NEXT:    ldr r7, [sp, #88]
 ; CHECK-NEXT:    movs r5, #120
 ; CHECK-NEXT:    lsls r6, r3, #3
 ; CHECK-NEXT:    movs r4, #252
 ; CHECK-NEXT:    and.w r5, r5, r3, lsr #9
 ; CHECK-NEXT:    uxtb r6, r6
 ; CHECK-NEXT:    and.w r3, r4, r3, lsr #3
+; CHECK-NEXT:    adds r4, r2, #7
 ; CHECK-NEXT:    muls r6, r7, r6
+; CHECK-NEXT:    bic r4, r4, #7
 ; CHECK-NEXT:    mul lr, r3, r7
-; CHECK-NEXT:    vdup.16 q0, r6
-; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vdup.16 q0, lr
 ; CHECK-NEXT:    muls r5, r7, r5
-; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT:    vmov.i16 q0, #0xfc
-; CHECK-NEXT:    mov.w r6, #2016
-; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vdup.16 q0, r5
 ; CHECK-NEXT:    rsb.w r3, r7, #256
 ; CHECK-NEXT:    lsls r7, r1, #1
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vdup.16 q0, r6
+; CHECK-NEXT:    sub.w r1, r4, #8
+; CHECK-NEXT:    movs r4, #1
 ; CHECK-NEXT:    vmov.i16 q2, #0xf8
-; CHECK-NEXT:    vmov.i16 q5, #0x78
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.i16 q6, #0xf800
+; CHECK-NEXT:    add.w r1, r4, r1, lsr #3
+; CHECK-NEXT:    vdup.16 q6, r6
+; CHECK-NEXT:    mov.w r6, #2016
 ; CHECK-NEXT:    movs r4, #0
-; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vdup.16 q3, lr
+; CHECK-NEXT:    vdup.16 q5, r5
+; CHECK-NEXT:    vdup.16 q7, r6
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  .LBB1_3: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB1_4 Depth 2
 ; CHECK-NEXT:    mov r5, r0
-; CHECK-NEXT:    dlstp.16 lr, r2
+; CHECK-NEXT:    mov r6, r2
+; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    .p2align 2
 ; CHECK-NEXT:  .LBB1_4: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB1_3 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrh.u16 q0, [r5]
+; CHECK-NEXT:    vctp.16 r6
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrht.u16 q0, [r5]
 ; CHECK-NEXT:    vshl.i16 q1, q0, #3
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    subs r6, #8
 ; CHECK-NEXT:    vand q1, q1, q2
-; CHECK-NEXT:    vmla.i16 q3, q1, r3
-; CHECK-NEXT:    vmov.f64 d8, d4
-; CHECK-NEXT:    vmov.f64 d9, d5
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vshr.u16 q2, q0, #9
+; CHECK-NEXT:    vmov.i16 q2, #0x78
+; CHECK-NEXT:    vshr.u16 q4, q0, #9
+; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov q2, q6
+; CHECK-NEXT:    vmla.i16 q2, q1, r3
 ; CHECK-NEXT:    vshr.u16 q0, q0, #3
+; CHECK-NEXT:    vmov.i16 q1, #0xfc
 ; CHECK-NEXT:    vand q0, q0, q1
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov q1, q3
 ; CHECK-NEXT:    vmla.i16 q1, q0, r3
-; CHECK-NEXT:    vand q2, q2, q5
-; CHECK-NEXT:    vshr.u16 q0, q3, #11
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vshr.u16 q0, q2, #11
+; CHECK-NEXT:    vmov q2, q5
+; CHECK-NEXT:    vmla.i16 q2, q4, r3
 ; CHECK-NEXT:    vshr.u16 q1, q1, #5
-; CHECK-NEXT:    vmla.i16 q3, q2, r3
+; CHECK-NEXT:    vmov.i16 q4, #0xf800
 ; CHECK-NEXT:    vand q1, q1, q7
 ; CHECK-NEXT:    vorr q0, q1, q0
-; CHECK-NEXT:    vand q1, q3, q6
+; CHECK-NEXT:    vand q1, q2, q4
+; CHECK-NEXT:    vmov.i16 q2, #0xf8
 ; CHECK-NEXT:    vorr q0, q0, q1
-; CHECK-NEXT:    vstrh.16 q0, [r5], #16
-; CHECK-NEXT:    vmov.f64 d4, d8
-; CHECK-NEXT:    vmov.f64 d5, d9
-; CHECK-NEXT:    letp lr, .LBB1_4
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrht.16 q0, [r5], #16
+; CHECK-NEXT:    le lr, .LBB1_4
 ; CHECK-NEXT:  @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us
 ; CHECK-NEXT:    @ in Loop: Header=BB1_3 Depth=1
 ; CHECK-NEXT:    adds r4, #1
@@ -272,7 +266,6 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc
 ; CHECK-NEXT:    cmp r4, r12
 ; CHECK-NEXT:    bne .LBB1_3
 ; CHECK-NEXT:  @ %bb.6:
-; CHECK-NEXT:    add sp, #80
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, lr}
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 07c06e10979cd..1769c5d2fd385 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -17,17 +17,16 @@
 define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr {
 ; ENABLED-LABEL: varying_outer_2d_reduction:
 ; ENABLED:       @ %bb.0: @ %entry
-; ENABLED-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; ENABLED-NEXT:    sub sp, #4
 ; ENABLED-NEXT:    cmp r3, #1
-; ENABLED-NEXT:    str r0, [sp] @ 4-byte Spill
-; ENABLED-NEXT:    blt .LBB0_8
-; ENABLED-NEXT:  @ %bb.1: @ %for.body.lr.ph
-; ENABLED-NEXT:    ldr r0, [sp, #36]
-; ENABLED-NEXT:    add.w r12, r2, #3
-; ENABLED-NEXT:    ldr.w r10, [sp] @ 4-byte Reload
-; ENABLED-NEXT:    mov.w r8, #0
-; ENABLED-NEXT:    mov r9, r12
+; ENABLED-NEXT:    it lt
+; ENABLED-NEXT:    bxlt lr
+; ENABLED-NEXT:  .LBB0_1: @ %for.body.lr.ph
+; ENABLED-NEXT:    push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT:    mov r11, r0
+; ENABLED-NEXT:    ldr r0, [sp, #32]
+; ENABLED-NEXT:    add.w r9, r2, #3
+; ENABLED-NEXT:    mov.w r12, #0
+; ENABLED-NEXT:    mov r10, r11
 ; ENABLED-NEXT:    uxth r0, r0
 ; ENABLED-NEXT:    rsbs r5, r0, #0
 ; ENABLED-NEXT:    b .LBB0_4
@@ -37,31 +36,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; ENABLED-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; ENABLED-NEXT:    lsrs r0, r0, #16
 ; ENABLED-NEXT:    sub.w r9, r9, #1
-; ENABLED-NEXT:    strh.w r0, [r1, r8, lsl #1]
-; ENABLED-NEXT:    add.w r8, r8, #1
+; ENABLED-NEXT:    strh.w r0, [r1, r12, lsl #1]
+; ENABLED-NEXT:    add.w r12, r12, #1
 ; ENABLED-NEXT:    add.w r10, r10, #2
-; ENABLED-NEXT:    cmp r8, r3
+; ENABLED-NEXT:    cmp r12, r3
 ; ENABLED-NEXT:    beq .LBB0_8
 ; ENABLED-NEXT:  .LBB0_4: @ %for.body
 ; ENABLED-NEXT:    @ =>This Loop Header: Depth=1
 ; ENABLED-NEXT:    @ Child Loop BB0_6 Depth 2
-; ENABLED-NEXT:    cmp r2, r8
+; ENABLED-NEXT:    cmp r2, r12
 ; ENABLED-NEXT:    ble .LBB0_2
 ; ENABLED-NEXT:  @ %bb.5: @ %vector.ph
 ; ENABLED-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; ENABLED-NEXT:    bic r0, r9, #3
 ; ENABLED-NEXT:    movs r7, #1
 ; ENABLED-NEXT:    subs r0, #4
-; ENABLED-NEXT:    sub.w r4, r2, r8
+; ENABLED-NEXT:    sub.w r4, r2, r12
 ; ENABLED-NEXT:    vmov.i32 q1, #0x0
 ; ENABLED-NEXT:    add.w r6, r7, r0, lsr #2
-; ENABLED-NEXT:    sub.w r0, r12, r8
+; ENABLED-NEXT:    adds r0, r2, #3
+; ENABLED-NEXT:    sub.w r0, r0, r12
 ; ENABLED-NEXT:    bic r0, r0, #3
 ; ENABLED-NEXT:    subs r0, #4
 ; ENABLED-NEXT:    add.w r0, r7, r0, lsr #2
 ; ENABLED-NEXT:    mov r7, r10
 ; ENABLED-NEXT:    dls lr, r0
-; ENABLED-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; ENABLED-NEXT:    mov r0, r11
 ; ENABLED-NEXT:  .LBB0_6: @ %vector.body
 ; ENABLED-NEXT:    @ Parent Loop BB0_4 Depth=1
 ; ENABLED-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -82,23 +82,22 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; ENABLED-NEXT:    vpsel q0, q1, q0
 ; ENABLED-NEXT:    vaddv.u32 r0, q0
 ; ENABLED-NEXT:    b .LBB0_3
-; ENABLED-NEXT:  .LBB0_8: @ %for.end17
-; ENABLED-NEXT:    add sp, #4
-; ENABLED-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; ENABLED-NEXT:  .LBB0_8:
+; ENABLED-NEXT:    pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT:    bx lr
 ;
 ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction:
 ; NOREDUCTIONS:       @ %bb.0: @ %entry
-; NOREDUCTIONS-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; NOREDUCTIONS-NEXT:    sub sp, #4
 ; NOREDUCTIONS-NEXT:    cmp r3, #1
-; NOREDUCTIONS-NEXT:    str r0, [sp] @ 4-byte Spill
-; NOREDUCTIONS-NEXT:    blt .LBB0_8
-; NOREDUCTIONS-NEXT:  @ %bb.1: @ %for.body.lr.ph
-; NOREDUCTIONS-NEXT:    ldr r0, [sp, #36]
-; NOREDUCTIONS-NEXT:    add.w r12, r2, #3
-; NOREDUCTIONS-NEXT:    ldr.w r10, [sp] @ 4-byte Reload
-; NOREDUCTIONS-NEXT:    mov.w r8, #0
-; NOREDUCTIONS-NEXT:    mov r9, r12
+; NOREDUCTIONS-NEXT:    it lt
+; NOREDUCTIONS-NEXT:    bxlt lr
+; NOREDUCTIONS-NEXT:  .LBB0_1: @ %for.body.lr.ph
+; NOREDUCTIONS-NEXT:    push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT:    mov r11, r0
+; NOREDUCTIONS-NEXT:    ldr r0, [sp, #32]
+; NOREDUCTIONS-NEXT:    add.w r9, r2, #3
+; NOREDUCTIONS-NEXT:    mov.w r12, #0
+; NOREDUCTIONS-NEXT:    mov r10, r11
 ; NOREDUCTIONS-NEXT:    uxth r0, r0
 ; NOREDUCTIONS-NEXT:    rsbs r5, r0, #0
 ; NOREDUCTIONS-NEXT:    b .LBB0_4
@@ -108,31 +107,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    lsrs r0, r0, #16
 ; NOREDUCTIONS-NEXT:    sub.w r9, r9, #1
-; NOREDUCTIONS-NEXT:    strh.w r0, [r1, r8, lsl #1]
-; NOREDUCTIONS-NEXT:    add.w r8, r8, #1
+; NOREDUCTIONS-NEXT:    strh.w r0, [r1, r12, lsl #1]
+; NOREDUCTIONS-NEXT:    add.w r12, r12, #1
 ; NOREDUCTIONS-NEXT:    add.w r10, r10, #2
-; NOREDUCTIONS-NEXT:    cmp r8, r3
+; NOREDUCTIONS-NEXT:    cmp r12, r3
 ; NOREDUCTIONS-NEXT:    beq .LBB0_8
 ; NOREDUCTIONS-NEXT:  .LBB0_4: @ %for.body
 ; NOREDUCTIONS-NEXT:    @ =>This Loop Header: Depth=1
 ; NOREDUCTIONS-NEXT:    @ Child Loop BB0_6 Depth 2
-; NOREDUCTIONS-NEXT:    cmp r2, r8
+; NOREDUCTIONS-NEXT:    cmp r2, r12
 ; NOREDUCTIONS-NEXT:    ble .LBB0_2
 ; NOREDUCTIONS-NEXT:  @ %bb.5: @ %vector.ph
 ; NOREDUCTIONS-NEXT:    @ in Loop: Header=BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    bic r0, r9, #3
 ; NOREDUCTIONS-NEXT:    movs r7, #1
 ; NOREDUCTIONS-NEXT:    subs r0, #4
-; NOREDUCTIONS-NEXT:    sub.w r4, r2, r8
+; NOREDUCTIONS-NEXT:    sub.w r4, r2, r12
 ; NOREDUCTIONS-NEXT:    vmov.i32 q1, #0x0
 ; NOREDUCTIONS-NEXT:    add.w r6, r7, r0, lsr #2
-; NOREDUCTIONS-NEXT:    sub.w r0, r12, r8
+; NOREDUCTIONS-NEXT:    adds r0, r2, #3
+; NOREDUCTIONS-NEXT:    sub.w r0, r0, r12
 ; NOREDUCTIONS-NEXT:    bic r0, r0, #3
 ; NOREDUCTIONS-NEXT:    subs r0, #4
 ; NOREDUCTIONS-NEXT:    add.w r0, r7, r0, lsr #2
 ; NOREDUCTIONS-NEXT:    mov r7, r10
 ; NOREDUCTIONS-NEXT:    dls lr, r0
-; NOREDUCTIONS-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; NOREDUCTIONS-NEXT:    mov r0, r11
 ; NOREDUCTIONS-NEXT:  .LBB0_6: @ %vector.body
 ; NOREDUCTIONS-NEXT:    @ Parent Loop BB0_4 Depth=1
 ; NOREDUCTIONS-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -153,9 +153,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
 ; NOREDUCTIONS-NEXT:    vpsel q0, q1, q0
 ; NOREDUCTIONS-NEXT:    vaddv.u32 r0, q0
 ; NOREDUCTIONS-NEXT:    b .LBB0_3
-; NOREDUCTIONS-NEXT:  .LBB0_8: @ %for.end17
-; NOREDUCTIONS-NEXT:    add sp, #4
-; NOREDUCTIONS-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; NOREDUCTIONS-NEXT:  .LBB0_8:
+; NOREDUCTIONS-NEXT:    pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT:    bx lr
 entry:
   %conv = sext i16 %N to i32
   %cmp36 = icmp sgt i16 %N, 0
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
index e0a61b1f9d956..78dc35b283d18 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
@@ -49,18 +49,17 @@ define i32 @vcmp_new_vpst_combination(i32 %len, ptr nocapture readonly %arr) {
 ; CHECK-NEXT:    cmp r0, #1
 ; CHECK-NEXT:    blt .LBB1_4
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    vmov.i32 q0, #0x1
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    dlstp.32 lr, r0
 ; CHECK-NEXT:  .LBB1_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
-; CHECK-NEXT:    vcmp.i32 ne, q2, zr
-; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
+; CHECK-NEXT:    vcmp.i32 ne, q1, zr
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vmovt q2, q1
-; CHECK-NEXT:    vaddva.u32 r2, q2
+; CHECK-NEXT:    vmovt q1, q0
+; CHECK-NEXT:    vaddva.u32 r2, q1
 ; CHECK-NEXT:    letp lr, .LBB1_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r2
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index c8dd949ca9d88..a9043476b0549 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -993,10 +993,10 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #24
-; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    .pad #20
+; CHECK-NEXT:    sub sp, #20
 ; CHECK-NEXT:    cmp r3, #8
-; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    blo.w .LBB16_12
 ; CHECK-NEXT:  @ %bb.1: @ %if.then
 ; CHECK-NEXT:    lsrs.w r12, r3, #2
@@ -1016,50 +1016,48 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT:    subs r1, r7, #2
 ; CHECK-NEXT:    rsbs r7, r4, #0
-; CHECK-NEXT:    str r7, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    add.w r7, r3, #16
-; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r4, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    str r0, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB16_6
 ; CHECK-NEXT:  .LBB16_3: @ %while.end.loopexit
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    add.w r5, r5, r0, lsl #1
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    add.w r6, r6, r0, lsl #1
 ; CHECK-NEXT:    b .LBB16_5
 ; CHECK-NEXT:  .LBB16_4: @ %for.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    wls lr, r0, .LBB16_5
 ; CHECK-NEXT:    b .LBB16_10
 ; CHECK-NEXT:  .LBB16_5: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    subs.w r12, r12, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #8
-; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
+; CHECK-NEXT:    add.w r0, r6, r0, lsl #1
 ; CHECK-NEXT:    add.w r5, r0, #8
 ; CHECK-NEXT:    beq.w .LBB16_12
 ; CHECK-NEXT:  .LBB16_6: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB16_8 Depth 2
 ; CHECK-NEXT:    @ Child Loop BB16_11 Depth 2
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    ldrh.w lr, [r3, #14]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
-; CHECK-NEXT:    ldrh.w r8, [r3, #12]
+; CHECK-NEXT:    ldrh.w r10, [r3, #12]
 ; CHECK-NEXT:    ldrh r7, [r3, #10]
 ; CHECK-NEXT:    ldrh r4, [r3, #8]
 ; CHECK-NEXT:    ldrh r6, [r3, #6]
 ; CHECK-NEXT:    ldrh.w r9, [r3, #4]
 ; CHECK-NEXT:    ldrh.w r11, [r3, #2]
-; CHECK-NEXT:    ldrh.w r10, [r3]
+; CHECK-NEXT:    ldrh.w r8, [r3]
 ; CHECK-NEXT:    vstrb.8 q0, [r1], #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r5]
-; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    adds r0, r5, #2
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmul.f16 q0, q0, r10
+; CHECK-NEXT:    vmul.f16 q0, q0, r8
 ; CHECK-NEXT:    adds r0, r5, #6
 ; CHECK-NEXT:    vfma.f16 q0, q1, r11
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #4]
@@ -1068,73 +1066,73 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    add.w r0, r5, #10
 ; CHECK-NEXT:    vfma.f16 q0, q1, r6
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #8]
+; CHECK-NEXT:    add.w r6, r5, #16
 ; CHECK-NEXT:    vfma.f16 q0, q1, r4
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    add.w r0, r5, #14
 ; CHECK-NEXT:    vfma.f16 q0, q1, r7
 ; CHECK-NEXT:    vldrw.u32 q1, [r5, #12]
-; CHECK-NEXT:    adds r5, #16
-; CHECK-NEXT:    vfma.f16 q0, q1, r8
+; CHECK-NEXT:    vfma.f16 q0, q1, r10
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    vfma.f16 q0, q1, lr
 ; CHECK-NEXT:    cmp r0, #16
 ; CHECK-NEXT:    blo .LBB16_9
 ; CHECK-NEXT:  @ %bb.7: @ %for.body.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
 ; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT:    add.w r5, r3, #16
 ; CHECK-NEXT:    dls lr, r0
-; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB16_8: @ %for.body
 ; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh r0, [r6], #16
-; CHECK-NEXT:    vldrw.u32 q1, [r5]
-; CHECK-NEXT:    adds r4, r5, #2
+; CHECK-NEXT:    ldrh r0, [r5], #16
+; CHECK-NEXT:    vldrw.u32 q1, [r6]
+; CHECK-NEXT:    adds r4, r6, #2
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    vldrw.u32 q1, [r4]
-; CHECK-NEXT:    ldrh r0, [r6, #-14]
-; CHECK-NEXT:    adds r4, r5, #6
+; CHECK-NEXT:    ldrh r0, [r5, #-14]
+; CHECK-NEXT:    adds r4, r6, #6
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
-; CHECK-NEXT:    ldrh r0, [r6, #-12]
-; CHECK-NEXT:    vldrw.u32 q1, [r5, #4]
+; CHECK-NEXT:    ldrh r0, [r5, #-12]
+; CHECK-NEXT:    vldrw.u32 q1, [r6, #4]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    vldrw.u32 q1, [r4]
-; CHECK-NEXT:    ldrh r0, [r6, #-10]
-; CHECK-NEXT:    add.w r4, r5, #10
+; CHECK-NEXT:    ldrh r0, [r5, #-10]
+; CHECK-NEXT:    add.w r4, r6, #10
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
-; CHECK-NEXT:    ldrh r0, [r6, #-8]
-; CHECK-NEXT:    vldrw.u32 q1, [r5, #8]
+; CHECK-NEXT:    ldrh r0, [r5, #-8]
+; CHECK-NEXT:    vldrw.u32 q1, [r6, #8]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
 ; CHECK-NEXT:    vldrw.u32 q1, [r4]
-; CHECK-NEXT:    ldrh r0, [r6, #-6]
-; CHECK-NEXT:    ldrh r4, [r6, #-2]
+; CHECK-NEXT:    ldrh r0, [r5, #-6]
+; CHECK-NEXT:    ldrh r4, [r5, #-2]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
-; CHECK-NEXT:    ldrh r0, [r6, #-4]
-; CHECK-NEXT:    vldrw.u32 q1, [r5, #12]
+; CHECK-NEXT:    ldrh r0, [r5, #-4]
+; CHECK-NEXT:    vldrw.u32 q1, [r6, #12]
 ; CHECK-NEXT:    vfma.f16 q0, q1, r0
-; CHECK-NEXT:    add.w r0, r5, #14
+; CHECK-NEXT:    add.w r0, r6, #14
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    adds r5, #16
+; CHECK-NEXT:    adds r6, #16
 ; CHECK-NEXT:    vfma.f16 q0, q1, r4
 ; CHECK-NEXT:    le lr, .LBB16_8
 ; CHECK-NEXT:    b .LBB16_4
 ; CHECK-NEXT:  .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    add.w r5, r3, #16
 ; CHECK-NEXT:    b .LBB16_4
 ; CHECK-NEXT:  .LBB16_10: @ %while.body76.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    mov r0, r5
+; CHECK-NEXT:    mov r0, r6
 ; CHECK-NEXT:  .LBB16_11: @ %while.body76
 ; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldrh r4, [r6], #2
+; CHECK-NEXT:    ldrh r4, [r5], #2
 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #2
 ; CHECK-NEXT:    vfma.f16 q0, q1, r4
 ; CHECK-NEXT:    le lr, .LBB16_11
 ; CHECK-NEXT:    b .LBB16_3
 ; CHECK-NEXT:  .LBB16_12: @ %if.end
-; CHECK-NEXT:    add sp, #24
+; CHECK-NEXT:    add sp, #20
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 28166e455aba2..f7b4548f127bf 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -995,46 +995,44 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
 ; CHECK-NEXT:    ldrh r6, [r0]
-; CHECK-NEXT:    movs r5, #1
-; CHECK-NEXT:    ldrd r4, r10, [r0, #4]
+; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    ldrd r7, r10, [r0, #4]
 ; CHECK-NEXT:    sub.w r0, r6, #8
 ; CHECK-NEXT:    add.w r3, r0, r0, lsr #29
 ; CHECK-NEXT:    and r0, r0, #7
-; CHECK-NEXT:    asrs r7, r3, #3
-; CHECK-NEXT:    cmp r7, #1
+; CHECK-NEXT:    asrs r5, r3, #3
+; CHECK-NEXT:    cmp r5, #1
 ; CHECK-NEXT:    it gt
-; CHECK-NEXT:    asrgt r5, r3, #3
-; CHECK-NEXT:    add.w r3, r4, r6, lsl #2
+; CHECK-NEXT:    asrgt r4, r3, #3
+; CHECK-NEXT:    add.w r3, r7, r6, lsl #2
 ; CHECK-NEXT:    sub.w r9, r3, #4
 ; CHECK-NEXT:    rsbs r3, r6, #0
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT:    add.w r3, r10, #32
-; CHECK-NEXT:    str r5, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    str r6, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    str r4, [sp] @ 4-byte Spill
+; CHECK-NEXT:    str r6, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    str r0, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    b .LBB16_6
 ; CHECK-NEXT:  .LBB16_3: @ %while.end.loopexit
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    add.w r4, r4, r0, lsl #2
+; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    add.w r7, r7, r0, lsl #2
 ; CHECK-NEXT:    b .LBB16_5
 ; CHECK-NEXT:  .LBB16_4: @ %for.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT:    ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
+; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldrd r0, r9, [sp, #12] @ 8-byte Folded Reload
 ; CHECK-NEXT:    wls lr, r0, .LBB16_5
 ; CHECK-NEXT:    b .LBB16_10
 ; CHECK-NEXT:  .LBB16_5: @ %while.end
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    subs.w r12, r12, #1
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
-; CHECK-NEXT:    add.w r0, r4, r0, lsl #2
-; CHECK-NEXT:    add.w r4, r0, #16
+; CHECK-NEXT:    add.w r0, r7, r0, lsl #2
+; CHECK-NEXT:    add.w r7, r0, #16
 ; CHECK-NEXT:    beq .LBB16_12
 ; CHECK-NEXT:  .LBB16_6: @ %while.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
@@ -1042,76 +1040,76 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
 ; CHECK-NEXT:    @ Child Loop BB16_11 Depth 2
 ; CHECK-NEXT:    add.w lr, r10, #8
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
-; CHECK-NEXT:    ldrd r3, r7, [r10]
+; CHECK-NEXT:    ldrd r3, r4, [r10]
 ; CHECK-NEXT:    ldm.w lr, {r0, r5, r6, lr}
 ; CHECK-NEXT:    ldrd r11, r8, [r10, #24]
 ; CHECK-NEXT:    vstrb.8 q0, [r9], #16
-; CHECK-NEXT:    vldrw.u32 q0, [r4], #32
-; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    str.w r9, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r4, #-28]
+; CHECK-NEXT:    vldrw.u32 q0, [r7], #32
+; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    str.w r9, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [r7, #-28]
 ; CHECK-NEXT:    vmul.f32 q0, q0, r3
-; CHECK-NEXT:    vldrw.u32 q6, [r4, #-24]
-; CHECK-NEXT:    vldrw.u32 q4, [r4, #-20]
-; CHECK-NEXT:    vfma.f32 q0, q1, r7
-; CHECK-NEXT:    vldrw.u32 q5, [r4, #-16]
+; CHECK-NEXT:    vldrw.u32 q6, [r7, #-24]
+; CHECK-NEXT:    vldrw.u32 q4, [r7, #-20]
+; CHECK-NEXT:    vfma.f32 q0, q1, r4
+; CHECK-NEXT:    vldrw.u32 q5, [r7, #-16]
 ; CHECK-NEXT:    vfma.f32 q0, q6, r0
-; CHECK-NEXT:    vldrw.u32 q2, [r4, #-12]
+; CHECK-NEXT:    vldrw.u32 q2, [r7, #-12]
 ; CHECK-NEXT:    vfma.f32 q0, q4, r5
-; CHECK-NEXT:    vldrw.u32 q3, [r4, #-8]
+; CHECK-NEXT:    vldrw.u32 q3, [r7, #-8]
 ; CHECK-NEXT:    vfma.f32 q0, q5, r6
-; CHECK-NEXT:    vldrw.u32 q1, [r4, #-4]
+; CHECK-NEXT:    vldrw.u32 q1, [r7, #-4]
 ; CHECK-NEXT:    vfma.f32 q0, q2, lr
-; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    vfma.f32 q0, q3, r11
 ; CHECK-NEXT:    vfma.f32 q0, q1, r8
 ; CHECK-NEXT:    cmp r0, #16
 ; CHECK-NEXT:    blo .LBB16_9
 ; CHECK-NEXT:  @ %bb.7: @ %for.body.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT:    add.w r4, r10, #32
 ; CHECK-NEXT:    dls lr, r0
-; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:  .LBB16_8: @ %for.body
 ; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldm.w r7, {r0, r3, r5, r6, r8, r11}
-; CHECK-NEXT:    vldrw.u32 q1, [r4], #32
-; CHECK-NEXT:    vldrw.u32 q6, [r4, #-24]
-; CHECK-NEXT:    vldrw.u32 q4, [r4, #-20]
+; CHECK-NEXT:    ldm.w r4, {r0, r3, r5, r6, r8, r11}
+; CHECK-NEXT:    vldrw.u32 q1, [r7], #32
+; CHECK-NEXT:    vldrw.u32 q6, [r7, #-24]
+; CHECK-NEXT:    vldrw.u32 q4, [r7, #-20]
 ; CHECK-NEXT:    vfma.f32 q0, q1, r0
-; CHECK-NEXT:    vldrw.u32 q1, [r4, #-28]
-; CHECK-NEXT:    vldrw.u32 q5, [r4, #-16]
-; CHECK-NEXT:    vldrw.u32 q2, [r4, #-12]
+; CHECK-NEXT:    vldrw.u32 q1, [r7, #-28]
+; CHECK-NEXT:    vldrw.u32 q5, [r7, #-16]
+; CHECK-NEXT:    vldrw.u32 q2, [r7, #-12]
 ; CHECK-NEXT:    vfma.f32 q0, q1, r3
-; CHECK-NEXT:    ldrd r9, r1, [r7, #24]
+; CHECK-NEXT:    ldrd r9, r1, [r4, #24]
 ; CHECK-NEXT:    vfma.f32 q0, q6, r5
-; CHECK-NEXT:    vldrw.u32 q3, [r4, #-8]
+; CHECK-NEXT:    vldrw.u32 q3, [r7, #-8]
 ; CHECK-NEXT:    vfma.f32 q0, q4, r6
-; CHECK-NEXT:    vldrw.u32 q1, [r4, #-4]
+; CHECK-NEXT:    vldrw.u32 q1, [r7, #-4]
 ; CHECK-NEXT:    vfma.f32 q0, q5, r8
-; CHECK-NEXT:    adds r7, #32
+; CHECK-NEXT:    adds r4, #32
 ; CHECK-NEXT:    vfma.f32 q0, q2, r11
 ; CHECK-NEXT:    vfma.f32 q0, q3, r9
 ; CHECK-NEXT:    vfma.f32 q0, q1, r1
 ; CHECK-NEXT:    le lr, .LBB16_8
 ; CHECK-NEXT:    b .LBB16_4
 ; CHECK-NEXT:  .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    add.w r4, r10, #32
 ; CHECK-NEXT:    b .LBB16_4
 ; CHECK-NEXT:  .LBB16_10: @ %while.body76.preheader
 ; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT:    mov r3, r4
+; CHECK-NEXT:    mov r3, r7
 ; CHECK-NEXT:  .LBB16_11: @ %while.body76
 ; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    ldr r0, [r7], #4
+; CHECK-NEXT:    ldr r0, [r4], #4
 ; CHECK-NEXT:    vldrw.u32 q1, [r3], #4
 ; CHECK-NEXT:    vfma.f32 q0, q1, r0
 ; CHECK-NEXT:    le lr, .LBB16_11
 ; CHECK-NEXT:    b .LBB16_3
 ; CHECK-NEXT:  .LBB16_12:
-; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index e8b49c1067379..0d86f22a321e0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -711,8 +711,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #136
-; CHECK-NEXT:    sub sp, #136
+; CHECK-NEXT:    .pad #120
+; CHECK-NEXT:    sub sp, #120
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    strd r1, r2, [sp, #64] @ 8-byte Folded Spill
 ; CHECK-NEXT:    blt.w .LBB14_5
@@ -725,22 +725,20 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
 ; CHECK-NEXT:    subs r1, #8
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #40] @ 16-byte Spill
-; CHECK-NEXT:    vmov.i16 q2, #0x18
 ; CHECK-NEXT:    add.w r1, r2, r1, lsr #3
 ; CHECK-NEXT:    str r1, [sp, #60] @ 4-byte Spill
 ; CHECK-NEXT:    adr r1, .LCPI14_0
 ; CHECK-NEXT:    adr r2, .LCPI14_1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vstrw.32 q2, [sp, #72] @ 16-byte Spill
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #24] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r2]
-; CHECK-NEXT:    add r2, sp, #120
+; CHECK-NEXT:    add r2, sp, #104
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #8] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB14_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB14_3 Depth 2
 ; CHECK-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT:    add.w r10, sp, #104
+; CHECK-NEXT:    add.w r10, sp, #88
 ; CHECK-NEXT:    dls lr, r1
 ; CHECK-NEXT:    ldr r7, [sp, #64] @ 4-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q4, [sp, #24] @ 16-byte Reload
@@ -762,7 +760,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vmov r6, r2, d4
 ; CHECK-NEXT:    ldrh r1, [r1]
 ; CHECK-NEXT:    ldrh.w r12, [r4]
-; CHECK-NEXT:    add r4, sp, #88
+; CHECK-NEXT:    add r4, sp, #72
 ; CHECK-NEXT:    ldrh.w r11, [r5]
 ; CHECK-NEXT:    ldrh r3, [r3]
 ; CHECK-NEXT:    ldrh r5, [r6]
@@ -807,7 +805,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vmov.16 q3[0], r2
 ; CHECK-NEXT:    vmov.16 q3[1], r5
 ; CHECK-NEXT:    vmov r2, r5, d5
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i16 q2, #0x18
 ; CHECK-NEXT:    vadd.i16 q6, q6, q2
 ; CHECK-NEXT:    vadd.i16 q5, q5, q2
 ; CHECK-NEXT:    vadd.i16 q4, q4, q2
@@ -849,7 +847,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    cmp r1, r3
 ; CHECK-NEXT:    bne.w .LBB14_2
 ; CHECK-NEXT:  .LBB14_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #136
+; CHECK-NEXT:    add sp, #120
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -950,7 +948,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
 ; CHECK-NEXT:    adr r6, .LCPI15_9
-; CHECK-NEXT:    vmov.i32 q2, #0x30
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #64] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [r7]
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
@@ -963,212 +960,213 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
 ; CHECK-NEXT:  .LBB15_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB15_3 Depth 2
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    adr r1, .LCPI15_3
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    adr r1, .LCPI15_4
 ; CHECK-NEXT:    vldrw.u32 q5, [r1]
+; CHECK-NEXT:    adr r1, .LCPI15_4
+; CHECK-NEXT:    vstrw.32 q2, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q6, [r1]
 ; CHECK-NEXT:    adr r1, .LCPI15_2
-; CHECK-NEXT:    vldrw.u32 q3, [r1]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #280] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-NEXT:    adr r1, .LCPI15_10
-; CHECK-NEXT:    vstrw.32 q6, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q3, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q2, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q3, [r1]
 ; CHECK-NEXT:    adr r1, .LCPI15_11
 ; CHECK-NEXT:    ldr.w r8, [sp, #116] @ 4-byte Reload
-; CHECK-NEXT:    vstrw.32 q3, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q6, [sp, #264] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q2, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vldrw.u32 q7, [r1]
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q3, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    mov r11, r10
-; CHECK-NEXT:    vstrw.32 q6, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q3, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q2, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q0, [sp, #216] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB15_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB15_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vadd.i32 q4, q1, r0
-; CHECK-NEXT:    vstrw.32 q7, [sp, #136] @ 16-byte Spill
-; CHECK-NEXT:    vmov r1, lr, d8
-; CHECK-NEXT:    vadd.i32 q7, q7, r0
-; CHECK-NEXT:    vmov r5, r4, d15
-; CHECK-NEXT:    vadd.i32 q6, q0, r0
-; CHECK-NEXT:    vmov r6, r7, d13
+; CHECK-NEXT:    vmov q0, q7
+; CHECK-NEXT:    vstrw.32 q7, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT:    vadd.i32 q7, q5, r0
+; CHECK-NEXT:    vstrw.32 q5, [sp, #200] @ 16-byte Spill
+; CHECK-NEXT:    vadd.i32 q5, q0, r0
+; CHECK-NEXT:    vmov q0, q6
+; CHECK-NEXT:    vadd.i32 q6, q4, r0
+; CHECK-NEXT:    vmov r5, r4, d11
+; CHECK-NEXT:    vmov r1, lr, d12
+; CHECK-NEXT:    vadd.i32 q2, q1, r0
+; CHECK-NEXT:    vmov r6, r7, d15
 ; CHECK-NEXT:    vstrw.32 q1, [sp, #152] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [sp, #168] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #216] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vstrw.32 q5, [sp, #120] @ 16-byte Spill
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov q1, q3
+; CHECK-NEXT:    vstrw.32 q4, [sp, #168] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #248] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [sp, #120] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #136] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #184] @ 16-byte Reload
 ; CHECK-NEXT:    subs.w r11, r11, #16
-; CHECK-NEXT:    ldrb.w r9, [r1]
-; CHECK-NEXT:    vmov r1, r3, d14
 ; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    ldrb.w r9, [r1]
+; CHECK-NEXT:    vmov r1, r3, d10
 ; CHECK-NEXT:    ldrb r7, [r7]
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q7[0], r1
+; CHECK-NEXT:    vmov.8 q5[0], r1
 ; CHECK-NEXT:    ldrb r1, [r3]
-; CHECK-NEXT:    vmov.8 q7[1], r1
-; CHECK-NEXT:    vmov r1, r3, d12
-; CHECK-NEXT:    vmov.8 q7[2], r5
+; CHECK-NEXT:    vmov.8 q5[1], r1
+; CHECK-NEXT:    vmov r1, r3, d14
+; CHECK-NEXT:    vmov.8 q5[2], r5
 ; CHECK-NEXT:    ldrb r5, [r6]
 ; CHECK-NEXT:    ldrb r6, [r4]
-; CHECK-NEXT:    vmov.8 q7[3], r6
+; CHECK-NEXT:    vmov.8 q5[3], r6
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.8 q6[0], r1
-; CHECK-NEXT:    vmov r6, r1, d2
-; CHECK-NEXT:    vmov.8 q6[1], r3
-; CHECK-NEXT:    vmov.8 q6[2], r5
-; CHECK-NEXT:    vmov.8 q6[3], r7
+; CHECK-NEXT:    vmov.8 q7[0], r1
+; CHECK-NEXT:    vmov r6, r1, d4
+; CHECK-NEXT:    vmov.8 q7[1], r3
+; CHECK-NEXT:    vmov.8 q7[2], r5
+; CHECK-NEXT:    vmov.8 q7[3], r7
 ; CHECK-NEXT:    ldrb.w r7, [lr]
-; CHECK-NEXT:    vmov.8 q6[4], r9
-; CHECK-NEXT:    vmov.8 q6[5], r7
+; CHECK-NEXT:    vmov.8 q7[4], r9
+; CHECK-NEXT:    vmov.8 q7[5], r7
 ; CHECK-NEXT:    ldrb r4, [r1]
-; CHECK-NEXT:    vmov r1, r5, d3
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #232] @ 16-byte Reload
+; CHECK-NEXT:    vmov r1, r5, d5
+; CHECK-NEXT:    vadd.i32 q2, q1, r0
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #280] @ 16-byte Reload
 ; CHECK-NEXT:    ldrb.w r12, [r1]
-; CHECK-NEXT:    vmov r1, r3, d9
+; CHECK-NEXT:    vmov r1, r3, d13
 ; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #184] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #232] @ 16-byte Reload
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.8 q6[6], r1
-; CHECK-NEXT:    vmov r1, r7, d0
-; CHECK-NEXT:    vmov.8 q6[7], r3
+; CHECK-NEXT:    vmov.8 q7[6], r1
+; CHECK-NEXT:    vmov r1, r7, d4
+; CHECK-NEXT:    vmov.8 q7[7], r3
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    ldrb r7, [r7]
-; CHECK-NEXT:    vmov.8 q7[4], r1
-; CHECK-NEXT:    vmov r1, r3, d1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #264] @ 16-byte Reload
-; CHECK-NEXT:    vmov.8 q7[5], r7
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vmov.8 q5[4], r1
+; CHECK-NEXT:    vmov r1, r3, d5
+; CHECK-NEXT:    vmov.8 q5[5], r7
+; CHECK-NEXT:    vadd.i32 q2, q1, r0
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #296] @ 16-byte Reload
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.8 q7[6], r1
+; CHECK-NEXT:    vmov.8 q5[6], r1
 ; CHECK-NEXT:    ldrb r1, [r6]
-; CHECK-NEXT:    vmov r7, r6, d0
-; CHECK-NEXT:    vmov.8 q7[7], r3
-; CHECK-NEXT:    vmov r3, lr, d1
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT:    vmov.8 q7[8], r1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov.8 q7[9], r4
-; CHECK-NEXT:    vmov r4, r1, d0
-; CHECK-NEXT:    vmov.8 q7[10], r12
-; CHECK-NEXT:    vmov.8 q7[11], r5
+; CHECK-NEXT:    vmov.8 q5[7], r3
+; CHECK-NEXT:    vmov r7, r6, d4
+; CHECK-NEXT:    vmov r3, lr, d5
+; CHECK-NEXT:    vmov.8 q5[8], r1
+; CHECK-NEXT:    vadd.i32 q2, q1, r0
+; CHECK-NEXT:    vmov.8 q5[9], r4
+; CHECK-NEXT:    vmov r4, r1, d4
+; CHECK-NEXT:    vmov.8 q5[10], r12
+; CHECK-NEXT:    vmov.8 q5[11], r5
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #264] @ 16-byte Reload
 ; CHECK-NEXT:    ldrb r7, [r7]
 ; CHECK-NEXT:    ldrb r6, [r6]
 ; CHECK-NEXT:    ldrb r3, [r3]
 ; CHECK-NEXT:    ldrb r4, [r4]
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q6[8], r4
-; CHECK-NEXT:    vmov r5, r4, d1
-; CHECK-NEXT:    vmov.8 q6[9], r1
-; CHECK-NEXT:    vadd.i32 q0, q5, r0
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #200] @ 16-byte Reload
+; CHECK-NEXT:    vmov.8 q7[8], r4
+; CHECK-NEXT:    vmov r5, r4, d5
+; CHECK-NEXT:    vmov.8 q7[9], r1
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #216] @ 16-byte Reload
 ; CHECK-NEXT:    ldrb r5, [r5]
 ; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    vmov.8 q6[10], r5
-; CHECK-NEXT:    vmov.8 q6[11], r4
-; CHECK-NEXT:    vmov.8 q6[12], r7
-; CHECK-NEXT:    vmov.8 q6[13], r6
-; CHECK-NEXT:    vmov.8 q6[14], r3
-; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    vmov.8 q7[10], r5
+; CHECK-NEXT:    vmov.8 q7[11], r4
+; CHECK-NEXT:    vmov.8 q7[12], r7
+; CHECK-NEXT:    vmov.8 q7[13], r6
+; CHECK-NEXT:    vmov.8 q7[14], r3
+; CHECK-NEXT:    vmov r1, r3, d4
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q7[12], r1
+; CHECK-NEXT:    vmov.8 q5[12], r1
 ; CHECK-NEXT:    ldrb r1, [r3]
-; CHECK-NEXT:    vmov.8 q7[13], r1
-; CHECK-NEXT:    vmov r1, r3, d1
-; CHECK-NEXT:    vadd.i32 q0, q1, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, q2
-; CHECK-NEXT:    vstrw.32 q1, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q1, q1, q2
-; CHECK-NEXT:    vstrw.32 q1, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #152] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vmov.8 q5[13], r1
+; CHECK-NEXT:    vmov r1, r3, d5
+; CHECK-NEXT:    vadd.i32 q2, q1, r0
 ; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.8 q7[14], r1
+; CHECK-NEXT:    vmov.8 q5[14], r1
 ; CHECK-NEXT:    ldrb r1, [r3]
-; CHECK-NEXT:    vmov.8 q7[15], r1
+; CHECK-NEXT:    vmov.8 q5[15], r1
 ; CHECK-NEXT:    ldrb.w r1, [lr]
-; CHECK-NEXT:    vmov.8 q6[15], r1
-; CHECK-NEXT:    vmov r1, r3, d0
-; CHECK-NEXT:    vadd.i8 q6, q6, q7
+; CHECK-NEXT:    vmov.8 q7[15], r1
+; CHECK-NEXT:    vmov r1, r3, d4
+; CHECK-NEXT:    vadd.i8 q5, q7, q5
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    ldrb r3, [r3]
 ; CHECK-NEXT:    vmov.8 q7[0], r1
 ; CHECK-NEXT:    vmov.8 q7[1], r3
-; CHECK-NEXT:    vmov r1, r3, d1
-; CHECK-NEXT:    vadd.i32 q0, q3, r0
-; CHECK-NEXT:    vadd.i32 q3, q3, q2
-; CHECK-NEXT:    vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q3, q3, q2
-; CHECK-NEXT:    vstrw.32 q3, [sp, #296] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q3, q3, q2
-; CHECK-NEXT:    vstrw.32 q3, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #264] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q3, q3, q2
-; CHECK-NEXT:    vstrw.32 q3, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT:    vmov r1, r3, d5
+; CHECK-NEXT:    vadd.i32 q2, q4, r0
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[2], r1
 ; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[3], r1
-; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    vmov r1, r3, d4
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[4], r1
 ; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[5], r1
-; CHECK-NEXT:    vmov r1, r3, d1
-; CHECK-NEXT:    vadd.i32 q0, q5, r0
-; CHECK-NEXT:    vadd.i32 q5, q5, q2
-; CHECK-NEXT:    vstrw.32 q5, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #120] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q5, q5, q2
+; CHECK-NEXT:    vmov r1, r3, d5
+; CHECK-NEXT:    vadd.i32 q2, q6, r0
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[6], r1
 ; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[7], r1
-; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    vmov r1, r3, d4
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[8], r1
 ; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[9], r1
-; CHECK-NEXT:    vmov r1, r3, d1
-; CHECK-NEXT:    vadd.i32 q0, q4, r0
-; CHECK-NEXT:    vadd.i32 q4, q4, q2
-; CHECK-NEXT:    vstrw.32 q4, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT:    vmov r1, r3, d5
+; CHECK-NEXT:    vadd.i32 q2, q0, r0
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[10], r1
 ; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[11], r1
-; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    vmov r1, r3, d4
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[12], r1
 ; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[13], r1
-; CHECK-NEXT:    vmov r1, r3, d1
+; CHECK-NEXT:    vmov r1, r3, d5
 ; CHECK-NEXT:    ldrb r1, [r1]
 ; CHECK-NEXT:    vmov.8 q7[14], r1
 ; CHECK-NEXT:    ldrb r1, [r3]
 ; CHECK-NEXT:    vmov.8 q7[15], r1
-; CHECK-NEXT:    vadd.i8 q0, q6, q7
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #136] @ 16-byte Reload
-; CHECK-NEXT:    vstrb.8 q0, [r8], #16
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #168] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q7, q7, q2
+; CHECK-NEXT:    vadd.i8 q2, q5, q7
+; CHECK-NEXT:    vldrw.u32 q5, [sp, #200] @ 16-byte Reload
+; CHECK-NEXT:    vstrb.8 q2, [r8], #16
+; CHECK-NEXT:    vmov.i32 q2, #0x30
+; CHECK-NEXT:    vadd.i32 q6, q6, q2
+; CHECK-NEXT:    vadd.i32 q3, q3, q2
+; CHECK-NEXT:    vstrw.32 q6, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #296] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vadd.i32 q4, q4, q2
+; CHECK-NEXT:    vadd.i32 q6, q6, q2
 ; CHECK-NEXT:    vadd.i32 q0, q0, q2
+; CHECK-NEXT:    vmov q7, q3
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #136] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q1, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #152] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q4, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #168] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q6, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #120] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [sp, #216] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #280] @ 16-byte Reload
+; CHECK-NEXT:    vadd.i32 q5, q5, q2
+; CHECK-NEXT:    vadd.i32 q3, q3, q2
+; CHECK-NEXT:    vadd.i32 q0, q0, q2
+; CHECK-NEXT:    vadd.i32 q4, q4, q2
+; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vadd.i32 q6, q6, q2
+; CHECK-NEXT:    vstrw.32 q0, [sp, #280] @ 16-byte Spill
 ; CHECK-NEXT:    bne.w .LBB15_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB15_2 Depth=1
@@ -1501,14 +1499,14 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n)
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    blt .LBB18_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    adr.w lr, .LCPI18_0
+; CHECK-NEXT:    adr r3, .LCPI18_0
 ; CHECK-NEXT:    adr r4, .LCPI18_1
 ; CHECK-NEXT:    adr r5, .LCPI18_2
 ; CHECK-NEXT:    adr r6, .LCPI18_3
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
 ; CHECK-NEXT:    vldrw.u32 q2, [r4]
-; CHECK-NEXT:    vldrw.u32 q3, [lr]
+; CHECK-NEXT:    vldrw.u32 q3, [r3]
 ; CHECK-NEXT:    vadd.i32 q0, q0, r1
 ; CHECK-NEXT:    vadd.i32 q1, q1, r1
 ; CHECK-NEXT:    vadd.i32 q2, q2, r1
diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index dad856c0677a1..00a998cba6426 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -38,7 +38,7 @@ define arm_aapcs_vfpcc void @k() {
 ; CHECK-NEXT:    vmov.i32 q5, #0x0
 ; CHECK-NEXT:    vpsel q6, q4, q3
 ; CHECK-NEXT:    vstrh.16 q6, [r0]
-; CHECK-NEXT:    vmov q6, q5
+; CHECK-NEXT:    vmov.i32 q6, #0x0
 ; CHECK-NEXT:    cbz r1, .LBB0_2
 ; CHECK-NEXT:    le .LBB0_1
 ; CHECK-NEXT:  .LBB0_2: @ %for.cond4.preheader
@@ -135,12 +135,12 @@ vector.body115:                                   ; preds = %vector.body115, %ve
 define dso_local i32 @e() #0 {
 ; CHECK-LABEL: e:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #408
-; CHECK-NEXT:    sub sp, #408
+; CHECK-NEXT:    .pad #392
+; CHECK-NEXT:    sub sp, #392
 ; CHECK-NEXT:    movw r7, :lower16:.L_MergedGlobals
 ; CHECK-NEXT:    vldr s15, .LCPI1_1
 ; CHECK-NEXT:    movt r7, :upper16:.L_MergedGlobals
@@ -148,18 +148,16 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT:    mov r4, r7
 ; CHECK-NEXT:    mov r3, r7
 ; CHECK-NEXT:    ldr r6, [r4, #8]!
-; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    ldr r0, [r3, #4]!
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
 ; CHECK-NEXT:    movt r2, :upper16:e
+; CHECK-NEXT:    ldr r0, [r3, #4]!
 ; CHECK-NEXT:    vmov r5, s15
 ; CHECK-NEXT:    vmov q0[2], q0[0], r4, r4
-; CHECK-NEXT:    vmov s13, r3
 ; CHECK-NEXT:    vldr s12, .LCPI1_0
+; CHECK-NEXT:    vmov s13, r3
 ; CHECK-NEXT:    vmov q0[3], q0[1], r5, r2
 ; CHECK-NEXT:    vdup.32 q7, r3
 ; CHECK-NEXT:    vmov q6[2], q6[0], r3, r5
-; CHECK-NEXT:    vstrw.32 q0, [sp, #92]
+; CHECK-NEXT:    vstrw.32 q0, [sp, #76]
 ; CHECK-NEXT:    vmov q0, q7
 ; CHECK-NEXT:    vmov q6[3], q6[1], r3, r2
 ; CHECK-NEXT:    vmov q4, q7
@@ -168,7 +166,7 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT:    vmov s21, r2
 ; CHECK-NEXT:    movs r1, #64
 ; CHECK-NEXT:    vmov.f32 s20, s12
-; CHECK-NEXT:    str r0, [sp, #40]
+; CHECK-NEXT:    str r0, [sp, #24]
 ; CHECK-NEXT:    vmov.f32 s22, s13
 ; CHECK-NEXT:    str r6, [r0]
 ; CHECK-NEXT:    vmov.f32 s23, s15
@@ -186,12 +184,12 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT:    vmov q2[3], q2[1], r4, r5
 ; CHECK-NEXT:    vmov.32 q4[0], r8
 ; CHECK-NEXT:    @ implicit-def: $r2
-; CHECK-NEXT:    str.w r8, [sp, #44]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #60]
-; CHECK-NEXT:    strh.w r12, [sp, #406]
+; CHECK-NEXT:    str.w r8, [sp, #28]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #44]
+; CHECK-NEXT:    strh.w r12, [sp, #390]
 ; CHECK-NEXT:    wlstp.8 lr, r1, .LBB1_2
 ; CHECK-NEXT:  .LBB1_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    vstrb.8 q0, [r2], #16
 ; CHECK-NEXT:    letp lr, .LBB1_1
 ; CHECK-NEXT:  .LBB1_2: @ %entry
@@ -199,7 +197,7 @@ define dso_local i32 @e() #0 {
 ; CHECK-NEXT:    str.w r8, [r7]
 ; CHECK-NEXT:    vstrw.32 q4, [r0]
 ; CHECK-NEXT:    vstrw.32 q2, [r0]
-; CHECK-NEXT:    str.w r12, [sp, #324]
+; CHECK-NEXT:    str.w r12, [sp, #308]
 ; CHECK-NEXT:  .LBB1_3: @ %for.cond
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    b .LBB1_3
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index f90af3cc5ba24..2587a0bb6e00b 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -115,17 +115,17 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    movs r4, #1
 ; CHECK-NEXT:    ldr r3, [r0]
 ; CHECK-NEXT:    add.w r11, r3, r12, lsl #2
-; CHECK-NEXT:    add.w r7, r3, r12, lsl #3
-; CHECK-NEXT:    lsl.w r9, r12, #3
+; CHECK-NEXT:    add.w r6, r3, r12, lsl #3
+; CHECK-NEXT:    lsl.w r10, r12, #3
 ; CHECK-NEXT:  .LBB1_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB1_3 Depth 2
 ; CHECK-NEXT:    ldr r5, [sp] @ 4-byte Reload
+; CHECK-NEXT:    add.w r9, r4, #1
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    add.w r10, r4, #1
 ; CHECK-NEXT:    mov r3, r11
-; CHECK-NEXT:    mov r0, r7
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    mov r0, r6
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB1_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB1_2 Depth=1
@@ -139,11 +139,11 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s2, s2, s3
-; CHECK-NEXT:    add.w r0, r2, r10, lsl #2
+; CHECK-NEXT:    add.w r0, r2, r9, lsl #2
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
-; CHECK-NEXT:    add r11, r9
+; CHECK-NEXT:    add r11, r10
 ; CHECK-NEXT:    vadd.f32 s6, s6, s7
-; CHECK-NEXT:    add r7, r9
+; CHECK-NEXT:    add r6, r10
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
 ; CHECK-NEXT:    vadd.f32 s0, s0, s2
 ; CHECK-NEXT:    vadd.f32 s2, s4, s6
@@ -228,46 +228,40 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9}
 ; CHECK-NEXT:    vpush {d8, d9}
-; CHECK-NEXT:    .pad #24
-; CHECK-NEXT:    sub sp, #24
-; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    ldr r1, [r0, #4]
-; CHECK-NEXT:    str r2, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    subs r1, #3
-; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    cmp r1, #2
 ; CHECK-NEXT:    blo .LBB2_5
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    ldr r3, [r0, #8]
+; CHECK-NEXT:    ldr.w r9, [r0, #8]
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    ldr r1, [r0]
-; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    add.w r0, r3, r3, lsl #1
-; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
-; CHECK-NEXT:    add.w r12, r1, r3, lsl #3
-; CHECK-NEXT:    adds r3, #3
+; CHECK-NEXT:    add.w r3, r9, #3
 ; CHECK-NEXT:    bic r3, r3, #3
-; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT:    add.w r10, r1, r0, lsl #2
+; CHECK-NEXT:    add.w r0, r9, r9, lsl #1
 ; CHECK-NEXT:    subs r3, #4
+; CHECK-NEXT:    add.w r10, r1, r9, lsl #2
+; CHECK-NEXT:    add.w r12, r1, r9, lsl #3
+; CHECK-NEXT:    add.w r1, r1, r0, lsl #2
+; CHECK-NEXT:    add.w r3, r5, r3, lsr #2
+; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
 ; CHECK-NEXT:    lsl.w r11, r0, #2
-; CHECK-NEXT:    add.w r1, r5, r3, lsr #2
-; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB2_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB2_3 Depth 2
-; CHECK-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    adds r0, r5, #2
-; CHECK-NEXT:    adds r2, r5, #1
-; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT:    mov r3, r9
-; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    mov r3, r10
 ; CHECK-NEXT:    mov r0, r12
-; CHECK-NEXT:    mov r4, r10
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    dlstp.32 lr, r7
+; CHECK-NEXT:    mov r4, r1
+; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    dlstp.32 lr, r9
 ; CHECK-NEXT:  .LBB2_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB2_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
@@ -282,31 +276,31 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB2_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s10, s10, s11
-; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    adds r0, r5, #1
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
-; CHECK-NEXT:    add r9, r11
+; CHECK-NEXT:    add r10, r11
 ; CHECK-NEXT:    vadd.f32 s6, s6, s7
-; CHECK-NEXT:    add.w r0, r1, r2, lsl #2
+; CHECK-NEXT:    add.w r0, r2, r0, lsl #2
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
 ; CHECK-NEXT:    add r12, r11
 ; CHECK-NEXT:    vadd.f32 s2, s2, s3
-; CHECK-NEXT:    add r10, r11
+; CHECK-NEXT:    add r1, r11
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
 ; CHECK-NEXT:    vadd.f32 s8, s8, s10
 ; CHECK-NEXT:    vadd.f32 s4, s4, s6
 ; CHECK-NEXT:    vadd.f32 s0, s0, s2
 ; CHECK-NEXT:    vstr s8, [r0]
-; CHECK-NEXT:    add.w r0, r1, r5, lsl #2
-; CHECK-NEXT:    adds r5, #3
+; CHECK-NEXT:    add.w r0, r2, r5, lsl #2
 ; CHECK-NEXT:    vstr s4, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
+; CHECK-NEXT:    adds r0, r5, #2
+; CHECK-NEXT:    adds r5, #3
+; CHECK-NEXT:    add.w r0, r2, r0, lsl #2
 ; CHECK-NEXT:    vstr s0, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r5, r0
 ; CHECK-NEXT:    blo .LBB2_2
 ; CHECK-NEXT:  .LBB2_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #24
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -394,15 +388,15 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11}
-; CHECK-NEXT:    .pad #40
-; CHECK-NEXT:    sub sp, #40
-; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
 ; CHECK-NEXT:    ldr r1, [r0, #4]
-; CHECK-NEXT:    str r2, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    str r2, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    subs r1, #4
-; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
 ; CHECK-NEXT:    cmp r1, #2
-; CHECK-NEXT:    blo.w .LBB3_5
+; CHECK-NEXT:    blo .LBB3_5
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
 ; CHECK-NEXT:    ldr r2, [r0, #8]
 ; CHECK-NEXT:    movs r6, #1
@@ -410,34 +404,28 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    add.w r0, r2, r2, lsl #1
 ; CHECK-NEXT:    add.w r12, r1, r2, lsl #2
 ; CHECK-NEXT:    add.w r8, r1, r2, lsl #3
-; CHECK-NEXT:    add.w r9, r1, r2, lsl #4
-; CHECK-NEXT:    add.w r11, r1, r0, lsl #2
+; CHECK-NEXT:    add.w r10, r1, r2, lsl #4
+; CHECK-NEXT:    add.w r9, r1, r0, lsl #2
 ; CHECK-NEXT:    adds r0, r2, #3
 ; CHECK-NEXT:    bic r0, r0, #3
 ; CHECK-NEXT:    subs r0, #4
 ; CHECK-NEXT:    add.w r0, r6, r0, lsr #2
-; CHECK-NEXT:    strd r0, r2, [sp, #8] @ 8-byte Folded Spill
+; CHECK-NEXT:    strd r0, r2, [sp, #4] @ 8-byte Folded Spill
 ; CHECK-NEXT:    lsls r0, r2, #4
-; CHECK-NEXT:    ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    ldrd r2, r7, [sp, #4] @ 8-byte Folded Reload
+; CHECK-NEXT:    str r0, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB3_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
-; CHECK-NEXT:    adds r0, r6, #3
-; CHECK-NEXT:    str r0, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT:    adds r0, r6, #2
-; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
 ; CHECK-NEXT:    vmov.i32 q0, #0x0
-; CHECK-NEXT:    str r0, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT:    adds r0, r6, #1
-; CHECK-NEXT:    str r0, [sp, #28] @ 4-byte Spill
 ; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:    mov r0, r8
-; CHECK-NEXT:    mov r5, r11
-; CHECK-NEXT:    mov r4, r9
-; CHECK-NEXT:    vmov q1, q0
-; CHECK-NEXT:    vmov q2, q0
-; CHECK-NEXT:    vmov q3, q0
+; CHECK-NEXT:    mov r5, r9
+; CHECK-NEXT:    mov r4, r10
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vmov.i32 q3, #0x0
 ; CHECK-NEXT:    dlstp.32 lr, r7
 ; CHECK-NEXT:  .LBB3_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB3_2 Depth=1
@@ -455,9 +443,9 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
 ; CHECK-NEXT:    vadd.f32 s14, s14, s15
-; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    vadd.f32 s12, s12, s13
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    adds r0, r6, #1
 ; CHECK-NEXT:    vadd.f32 s10, s10, s11
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
 ; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
@@ -471,24 +459,24 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vadd.f32 s0, s0, s2
 ; CHECK-NEXT:    vstr s12, [r0]
 ; CHECK-NEXT:    add.w r0, r1, r6, lsl #2
-; CHECK-NEXT:    adds r6, #4
 ; CHECK-NEXT:    vstr s8, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT:    adds r0, r6, #2
 ; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
 ; CHECK-NEXT:    vstr s4, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT:    adds r0, r6, #3
+; CHECK-NEXT:    adds r6, #4
 ; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
 ; CHECK-NEXT:    vstr s0, [r0]
-; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    add r12, r0
 ; CHECK-NEXT:    add r8, r0
-; CHECK-NEXT:    add r11, r0
 ; CHECK-NEXT:    add r9, r0
-; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    add r10, r0
+; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r6, r0
 ; CHECK-NEXT:    blo .LBB3_2
 ; CHECK-NEXT:  .LBB3_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #40
+; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -588,60 +576,53 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    ldr r1, [r0, #4]
 ; CHECK-NEXT:    subs r1, #5
-; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    cmp r1, #2
 ; CHECK-NEXT:    blo.w .LBB4_5
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    ldr r3, [r0, #8]
+; CHECK-NEXT:    ldr.w r12, [r0, #8]
 ; CHECK-NEXT:    ldr r1, [r0]
-; CHECK-NEXT:    adds r0, r3, #3
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    add.w r0, r12, #3
 ; CHECK-NEXT:    bic r0, r0, #3
-; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r8, r1, r12, lsl #2
 ; CHECK-NEXT:    subs r1, r0, #4
 ; CHECK-NEXT:    movs r0, #1
-; CHECK-NEXT:    lsls r5, r3, #2
+; CHECK-NEXT:    lsl.w r5, r12, #2
 ; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
-; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    add.w r1, r3, r3, lsl #2
-; CHECK-NEXT:    lsls r1, r1, #2
 ; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    add.w r1, r12, r12, lsl #2
+; CHECK-NEXT:    lsls r1, r1, #2
+; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB4_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB4_3 Depth 2
-; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    adds r1, r0, #4
-; CHECK-NEXT:    ldr r4, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    adds r1, r0, #3
-; CHECK-NEXT:    add.w r10, r0, #2
+; CHECK-NEXT:    ldr r4, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    add.w r11, r0, #1
-; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    mov r3, r8
-; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    vmov q3, q1
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    dlstp.32 lr, r7
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q3, #0x0
+; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    vmov.i32 q4, #0x0
+; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB4_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB4_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    add.w r9, r3, r5
 ; CHECK-NEXT:    vldrw.u32 q5, [r4], #16
 ; CHECK-NEXT:    vldrw.u32 q6, [r3], #16
-; CHECK-NEXT:    add.w r12, r9, r5
+; CHECK-NEXT:    add.w r10, r9, r5
 ; CHECK-NEXT:    vfma.f32 q3, q6, q5
 ; CHECK-NEXT:    vldrw.u32 q6, [r9]
-; CHECK-NEXT:    add.w r6, r12, r5
+; CHECK-NEXT:    add.w r6, r10, r5
 ; CHECK-NEXT:    vfma.f32 q4, q6, q5
-; CHECK-NEXT:    vldrw.u32 q6, [r12]
+; CHECK-NEXT:    vldrw.u32 q6, [r10]
 ; CHECK-NEXT:    adds r7, r6, r5
 ; CHECK-NEXT:    vfma.f32 q2, q6, q5
 ; CHECK-NEXT:    vldrw.u32 q6, [r6]
@@ -662,30 +643,31 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
 ; CHECK-NEXT:    vadd.f32 s0, s0, s1
 ; CHECK-NEXT:    vadd.f32 s1, s16, s18
-; CHECK-NEXT:    vadd.f32 s2, s2, s3
 ; CHECK-NEXT:    vadd.f32 s12, s12, s14
+; CHECK-NEXT:    vadd.f32 s2, s2, s3
 ; CHECK-NEXT:    vadd.f32 s4, s4, s6
 ; CHECK-NEXT:    vadd.f32 s6, s8, s10
 ; CHECK-NEXT:    vstr s1, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
-; CHECK-NEXT:    vadd.f32 s0, s0, s2
-; CHECK-NEXT:    adds r0, #5
 ; CHECK-NEXT:    vstr s12, [r1]
-; CHECK-NEXT:    add.w r1, r2, r10, lsl #2
+; CHECK-NEXT:    adds r1, r0, #2
+; CHECK-NEXT:    vadd.f32 s0, s0, s2
+; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s6, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, r0, #3
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s0, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, r0, #4
+; CHECK-NEXT:    adds r0, #5
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s4, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    add r8, r1
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, r1
-; CHECK-NEXT:    blo.w .LBB4_2
+; CHECK-NEXT:    blo .LBB4_2
 ; CHECK-NEXT:  .LBB4_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -797,63 +779,54 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
-; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    ldr r1, [r0, #4]
 ; CHECK-NEXT:    subs r1, #6
-; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    cmp r1, #2
 ; CHECK-NEXT:    blo.w .LBB5_5
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    ldr r3, [r0, #8]
+; CHECK-NEXT:    ldr.w r12, [r0, #8]
 ; CHECK-NEXT:    ldr r1, [r0]
-; CHECK-NEXT:    adds r0, r3, #3
-; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT:    add.w r0, r12, #3
 ; CHECK-NEXT:    bic r0, r0, #3
-; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r8, r1, r12, lsl #2
 ; CHECK-NEXT:    subs r1, r0, #4
 ; CHECK-NEXT:    movs r0, #1
-; CHECK-NEXT:    lsls r5, r3, #2
+; CHECK-NEXT:    lsl.w r5, r12, #2
 ; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
 ; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT:    add.w r1, r3, r3, lsl #1
+; CHECK-NEXT:    add.w r1, r12, r12, lsl #1
 ; CHECK-NEXT:    lsls r1, r1, #3
 ; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB5_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB5_3 Depth 2
-; CHECK-NEXT:    adds r1, r0, #5
-; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    adds r1, r0, #4
-; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    adds r1, r0, #3
-; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT:    vmov.i32 q1, #0x0
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT:    add.w r11, r0, #2
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    adds r4, r0, #1
-; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vmov.i32 q1, #0x0
 ; CHECK-NEXT:    mov r3, r8
-; CHECK-NEXT:    vmov q3, q1
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vmov q0, q1
-; CHECK-NEXT:    vmov q5, q1
-; CHECK-NEXT:    vmov q2, q1
-; CHECK-NEXT:    dlstp.32 lr, r7
+; CHECK-NEXT:    vmov.i32 q3, #0x0
+; CHECK-NEXT:    vmov.i32 q4, #0x0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q5, #0x0
+; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    dlstp.32 lr, r12
 ; CHECK-NEXT:  .LBB5_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB5_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    add.w r12, r3, r5
+; CHECK-NEXT:    add.w r10, r3, r5
 ; CHECK-NEXT:    vldrw.u32 q6, [r1], #16
 ; CHECK-NEXT:    vldrw.u32 q7, [r3], #16
-; CHECK-NEXT:    add.w r10, r12, r5
+; CHECK-NEXT:    add.w r11, r10, r5
 ; CHECK-NEXT:    vfma.f32 q4, q7, q6
-; CHECK-NEXT:    vldrw.u32 q7, [r12]
-; CHECK-NEXT:    add.w r6, r10, r5
-; CHECK-NEXT:    vfma.f32 q5, q7, q6
 ; CHECK-NEXT:    vldrw.u32 q7, [r10]
+; CHECK-NEXT:    add.w r6, r11, r5
+; CHECK-NEXT:    vfma.f32 q5, q7, q6
+; CHECK-NEXT:    vldrw.u32 q7, [r11]
 ; CHECK-NEXT:    adds r7, r6, r5
 ; CHECK-NEXT:    vfma.f32 q2, q7, q6
 ; CHECK-NEXT:    vldrw.u32 q7, [r6]
@@ -885,28 +858,29 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vstr s1, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT:    vadd.f32 s0, s0, s2
-; CHECK-NEXT:    adds r0, #6
 ; CHECK-NEXT:    vstr s3, [r1]
-; CHECK-NEXT:    add.w r1, r2, r11, lsl #2
+; CHECK-NEXT:    adds r1, r0, #2
 ; CHECK-NEXT:    vadd.f32 s4, s4, s6
+; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s8, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, r0, #3
 ; CHECK-NEXT:    vadd.f32 s6, s12, s14
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s0, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, r0, #4
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s6, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, r0, #5
+; CHECK-NEXT:    adds r0, #6
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s4, [r1]
 ; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    add r8, r1
-; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    blo.w .LBB5_2
 ; CHECK-NEXT:  .LBB5_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1030,73 +1004,64 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #72
-; CHECK-NEXT:    sub sp, #72
-; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    ldr r1, [r0, #4]
 ; CHECK-NEXT:    subs r1, #7
-; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    cmp r1, #2
 ; CHECK-NEXT:    blo.w .LBB6_5
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    ldr r3, [r0, #8]
+; CHECK-NEXT:    ldr.w r10, [r0, #8]
 ; CHECK-NEXT:    ldr r1, [r0]
-; CHECK-NEXT:    adds r0, r3, #3
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    add.w r0, r10, #3
 ; CHECK-NEXT:    bic r0, r0, #3
-; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r9, r1, r10, lsl #2
 ; CHECK-NEXT:    subs r1, r0, #4
 ; CHECK-NEXT:    movs r0, #1
-; CHECK-NEXT:    lsls r5, r3, #2
+; CHECK-NEXT:    lsl.w r5, r10, #2
 ; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
-; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    rsb r1, r3, r3, lsl #3
-; CHECK-NEXT:    lsls r1, r1, #2
 ; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    rsb r1, r10, r10, lsl #3
+; CHECK-NEXT:    lsls r1, r1, #2
+; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB6_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB6_3 Depth 2
-; CHECK-NEXT:    adds r1, r0, #6
-; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT:    adds r1, r0, #5
-; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT:    adds r1, r0, #4
-; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    adds r1, r0, #3
-; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT:    vmov.i32 q2, #0x0
-; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
 ; CHECK-NEXT:    adds r4, r0, #2
 ; CHECK-NEXT:    add.w r8, r0, #1
-; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q2, #0x0
 ; CHECK-NEXT:    mov r3, r9
-; CHECK-NEXT:    vmov q4, q2
-; CHECK-NEXT:    vmov q5, q2
-; CHECK-NEXT:    vmov q3, q2
-; CHECK-NEXT:    vmov q6, q2
-; CHECK-NEXT:    vmov q1, q2
-; CHECK-NEXT:    mov r12, r7
-; CHECK-NEXT:    vstrw.32 q2, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT:    dls lr, r6
+; CHECK-NEXT:    vmov.i32 q4, #0x0
+; CHECK-NEXT:    vmov.i32 q5, #0x0
+; CHECK-NEXT:    vmov.i32 q3, #0x0
+; CHECK-NEXT:    vmov.i32 q6, #0x0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    mov r12, r10
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:  .LBB6_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB6_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vctp.32 r12
-; CHECK-NEXT:    add.w r10, r3, r5
+; CHECK-NEXT:    add.w r11, r3, r5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q7, [r1], #16
 ; CHECK-NEXT:    vldrwt.u32 q0, [r3], #16
-; CHECK-NEXT:    add.w r11, r10, r5
+; CHECK-NEXT:    add.w r6, r11, r5
 ; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q5, q0, q7
-; CHECK-NEXT:    vldrwt.u32 q0, [r10]
-; CHECK-NEXT:    add.w r6, r11, r5
+; CHECK-NEXT:    vldrwt.u32 q0, [r11]
+; CHECK-NEXT:    adds r7, r6, r5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q6, q0, q7
-; CHECK-NEXT:    vldrwt.u32 q0, [r11]
-; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT:    vldrwt.u32 q0, [r6]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q6, q5
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q1, q0, q7
@@ -1104,26 +1069,26 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vmov q3, q1
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrwt.u32 q0, [r6]
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT:    adds r7, r6, r5
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vfmat.f32 q1, q0, q7
 ; CHECK-NEXT:    vldrwt.u32 q0, [r7]
+; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    adds r6, r7, r5
-; CHECK-NEXT:    vstrw.32 q1, [sp, #56] @ 16-byte Spill
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vfmat.f32 q1, q0, q7
+; CHECK-NEXT:    vldrwt.u32 q0, [r6]
+; CHECK-NEXT:    adds r7, r6, r5
+; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q1, q3
 ; CHECK-NEXT:    vmov q3, q4
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q3, q0, q7
-; CHECK-NEXT:    vldrwt.u32 q0, [r6]
+; CHECK-NEXT:    vldrwt.u32 q0, [r7]
 ; CHECK-NEXT:    vmov q4, q5
-; CHECK-NEXT:    adds r7, r6, r5
+; CHECK-NEXT:    adds r6, r7, r5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q4, q0, q7
-; CHECK-NEXT:    vldrwt.u32 q0, [r7]
+; CHECK-NEXT:    vldrwt.u32 q0, [r6]
 ; CHECK-NEXT:    vmov q5, q6
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q2, q0, q7
 ; CHECK-NEXT:    le lr, .LBB6_3
@@ -1138,45 +1103,45 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vadd.f32 s4, s4, s5
 ; CHECK-NEXT:    vadd.f32 s10, s10, s11
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
-; CHECK-NEXT:    vadd.f32 s0, s2, s0
 ; CHECK-NEXT:    vadd.f32 s9, s18, s19
 ; CHECK-NEXT:    vadd.f32 s11, s16, s17
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT:    vadd.f32 s2, s3, s1
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vadd.f32 s0, s2, s0
 ; CHECK-NEXT:    vadd.f32 s5, s18, s19
 ; CHECK-NEXT:    vadd.f32 s7, s16, s17
+; CHECK-NEXT:    vadd.f32 s2, s3, s1
 ; CHECK-NEXT:    vadd.f32 s4, s4, s6
-; CHECK-NEXT:    vstr s0, [r1]
-; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT:    vadd.f32 s14, s14, s15
-; CHECK-NEXT:    adds r0, #7
 ; CHECK-NEXT:    vadd.f32 s12, s12, s13
-; CHECK-NEXT:    vstr s2, [r1]
-; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
+; CHECK-NEXT:    vstr s0, [r1]
+; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT:    vadd.f32 s8, s8, s10
 ; CHECK-NEXT:    vadd.f32 s6, s7, s5
-; CHECK-NEXT:    vstr s4, [r1]
+; CHECK-NEXT:    vstr s2, [r1]
+; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT:    vadd.f32 s10, s11, s9
-; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT:    vstr s4, [r1]
+; CHECK-NEXT:    adds r1, r0, #3
 ; CHECK-NEXT:    vadd.f32 s12, s12, s14
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s6, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, r0, #4
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s12, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, r0, #5
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s10, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, r0, #6
+; CHECK-NEXT:    adds r0, #7
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s8, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    add r9, r1
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    blo.w .LBB6_2
 ; CHECK-NEXT:  .LBB6_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #72
+; CHECK-NEXT:    add sp, #48
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1312,107 +1277,99 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #88
-; CHECK-NEXT:    sub sp, #88
-; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT:    .pad #64
+; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
 ; CHECK-NEXT:    ldr r1, [r0, #4]
 ; CHECK-NEXT:    subs r1, #8
-; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
 ; CHECK-NEXT:    cmp r1, #2
 ; CHECK-NEXT:    blo.w .LBB7_5
 ; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT:    ldr r3, [r0, #8]
+; CHECK-NEXT:    ldr.w r11, [r0, #8]
 ; CHECK-NEXT:    ldr r1, [r0]
-; CHECK-NEXT:    adds r0, r3, #3
-; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT:    add.w r0, r11, #3
 ; CHECK-NEXT:    bic r0, r0, #3
-; CHECK-NEXT:    add.w r12, r1, r3, lsl #2
+; CHECK-NEXT:    add.w r12, r1, r11, lsl #2
 ; CHECK-NEXT:    subs r1, r0, #4
 ; CHECK-NEXT:    movs r0, #1
-; CHECK-NEXT:    lsls r6, r3, #2
+; CHECK-NEXT:    lsl.w r6, r11, #2
 ; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
-; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT:    lsls r1, r3, #5
 ; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT:    lsl.w r1, r11, #5
+; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
 ; CHECK-NEXT:  .LBB7_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB7_3 Depth 2
-; CHECK-NEXT:    adds r1, r0, #7
-; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT:    adds r1, r0, #6
-; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT:    adds r1, r0, #5
-; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT:    adds r1, r0, #4
-; CHECK-NEXT:    ldr.w r9, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT:    vmov.i32 q3, #0x0
-; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT:    ldr.w r9, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT:    vmov.i32 q0, #0x0
 ; CHECK-NEXT:    adds r4, r0, #3
 ; CHECK-NEXT:    add.w r8, r0, #2
 ; CHECK-NEXT:    adds r1, r0, #1
-; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vmov.i32 q3, #0x0
 ; CHECK-NEXT:    mov r3, r12
-; CHECK-NEXT:    vmov q5, q3
-; CHECK-NEXT:    vmov q6, q3
-; CHECK-NEXT:    vmov q4, q3
-; CHECK-NEXT:    vmov q7, q3
-; CHECK-NEXT:    vmov q2, q3
-; CHECK-NEXT:    mov r10, r7
-; CHECK-NEXT:    vstrw.32 q3, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q3, [sp, #72] @ 16-byte Spill
-; CHECK-NEXT:    dls lr, r5
+; CHECK-NEXT:    vmov.i32 q5, #0x0
+; CHECK-NEXT:    vmov.i32 q6, #0x0
+; CHECK-NEXT:    vmov.i32 q4, #0x0
+; CHECK-NEXT:    vmov.i32 q7, #0x0
+; CHECK-NEXT:    vmov.i32 q2, #0x0
+; CHECK-NEXT:    mov r10, r11
+; CHECK-NEXT:    vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    dls lr, r7
 ; CHECK-NEXT:  .LBB7_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB7_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vctp.32 r10
-; CHECK-NEXT:    add.w r11, r3, r6
+; CHECK-NEXT:    adds r5, r3, r6
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q0, [r9], #16
 ; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
-; CHECK-NEXT:    add.w r5, r11, r6
+; CHECK-NEXT:    adds r7, r5, r6
 ; CHECK-NEXT:    sub.w r10, r10, #4
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q6, q1, q0
-; CHECK-NEXT:    vldrwt.u32 q1, [r11]
-; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT:    vldrwt.u32 q1, [r5]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q6, q5
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q7, q1, q0
-; CHECK-NEXT:    vmov q5, q3
-; CHECK-NEXT:    vmov q3, q4
-; CHECK-NEXT:    vmov q4, q2
+; CHECK-NEXT:    vmov q5, q4
+; CHECK-NEXT:    vmov q4, q3
+; CHECK-NEXT:    vmov q3, q2
 ; CHECK-NEXT:    vpst
-; CHECK-NEXT:    vldrwt.u32 q1, [r5]
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT:    adds r7, r5, r6
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r7]
-; CHECK-NEXT:    vstrw.32 q2, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    adds r5, r7, r6
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
+; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    adds r7, r5, r6
-; CHECK-NEXT:    vstrw.32 q2, [sp, #72] @ 16-byte Spill
-; CHECK-NEXT:    vmov q2, q4
-; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r7]
 ; CHECK-NEXT:    adds r5, r7, r6
-; CHECK-NEXT:    vmov q3, q5
+; CHECK-NEXT:    vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov q2, q3
+; CHECK-NEXT:    vmov q3, q4
 ; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vfmat.f32 q4, q1, q0
+; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
+; CHECK-NEXT:    vmov q4, q5
+; CHECK-NEXT:    adds r7, r5, r6
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vfmat.f32 q4, q1, q0
+; CHECK-NEXT:    vldrwt.u32 q1, [r7]
 ; CHECK-NEXT:    vmov q5, q6
-; CHECK-NEXT:    add r5, r6
+; CHECK-NEXT:    adds r5, r7, r6
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q5, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #16] @ 16-byte Reload
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q3, q1, q0
 ; CHECK-NEXT:    le lr, .LBB7_3
@@ -1425,12 +1382,12 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vadd.f32 s6, s24, s25
 ; CHECK-NEXT:    vadd.f32 s5, s18, s19
 ; CHECK-NEXT:    vadd.f32 s7, s16, s17
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #56] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vadd.f32 s10, s10, s11
 ; CHECK-NEXT:    vadd.f32 s8, s8, s9
 ; CHECK-NEXT:    vadd.f32 s9, s18, s19
 ; CHECK-NEXT:    vadd.f32 s11, s16, s17
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vadd.f32 s14, s14, s15
 ; CHECK-NEXT:    vadd.f32 s12, s12, s13
 ; CHECK-NEXT:    vadd.f32 s13, s18, s19
@@ -1445,33 +1402,33 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vstr s0, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
 ; CHECK-NEXT:    vadd.f32 s3, s20, s21
-; CHECK-NEXT:    adds r0, #8
 ; CHECK-NEXT:    vstr s2, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r8, lsl #2
 ; CHECK-NEXT:    vadd.f32 s12, s7, s5
 ; CHECK-NEXT:    vstr s10, [r1]
 ; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
 ; CHECK-NEXT:    vstr s14, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT:    vadd.f32 s4, s3, s1
+; CHECK-NEXT:    adds r1, r0, #4
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
+; CHECK-NEXT:    vadd.f32 s4, s3, s1
 ; CHECK-NEXT:    vstr s8, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, r0, #5
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s12, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, r0, #6
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s4, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT:    adds r1, r0, #7
+; CHECK-NEXT:    adds r0, #8
 ; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
 ; CHECK-NEXT:    vstr s6, [r1]
-; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
 ; CHECK-NEXT:    add r12, r1
-; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
 ; CHECK-NEXT:    cmp r0, r1
 ; CHECK-NEXT:    blo.w .LBB7_2
 ; CHECK-NEXT:  .LBB7_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #88
+; CHECK-NEXT:    add sp, #64
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    add sp, #4
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
index 29c4fb902bf36..413c4a14a2593 100644
--- a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
@@ -1496,15 +1496,14 @@ define void @vfmasq(ptr %x, ptr %y, i32 %n) {
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
 ; CHECK-NEXT:  .LBB34_1: @ %for.body.preheader
-; CHECK-NEXT:    vmov.f32 q0, #1.000000e+01
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB34_2: @ %for.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
-; CHECK-NEXT:    vfma.f32 q3, q2, q1
-; CHECK-NEXT:    vstrw.32 q3, [r1], #16
+; CHECK-NEXT:    vmov.f32 q2, #1.000000e+01
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
+; CHECK-NEXT:    vfma.f32 q2, q1, q0
+; CHECK-NEXT:    vstrw.32 q2, [r1], #16
 ; CHECK-NEXT:    letp lr, .LBB34_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r7, pc}
@@ -1542,15 +1541,14 @@ define void @vfmas(ptr %s1, ptr %s2, i32 %N) {
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
 ; CHECK-NEXT:  .LBB35_1: @ %while.body.lr.ph
-; CHECK-NEXT:    vmov.f32 q0, #1.000000e+01
 ; CHECK-NEXT:    dlstp.32 lr, r2
 ; CHECK-NEXT:  .LBB35_2: @ %while.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vldrw.u32 q1, [r1]
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
-; CHECK-NEXT:    vfma.f32 q3, q2, q1
-; CHECK-NEXT:    vstrw.32 q3, [r0], #16
+; CHECK-NEXT:    vmov.f32 q2, #1.000000e+01
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vfma.f32 q2, q1, q0
+; CHECK-NEXT:    vstrw.32 q2, [r0], #16
 ; CHECK-NEXT:    letp lr, .LBB35_2
 ; CHECK-NEXT:  @ %bb.3: @ %while.end
 ; CHECK-NEXT:    pop {r7, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
index e845070d57904..62482c11d27e2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -287,17 +287,17 @@ define void @shlor(ptr nocapture readonly %x, ptr noalias nocapture %y, i32 %n)
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    blt .LBB5_3
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph
-; CHECK-NEXT:    adr.w lr, .LCPI5_0
-; CHECK-NEXT:    adr r4, .LCPI5_1
+; CHECK-NEXT:    adr r4, .LCPI5_0
+; CHECK-NEXT:    adr r3, .LCPI5_1
 ; CHECK-NEXT:    adr r5, .LCPI5_2
 ; CHECK-NEXT:    adr r6, .LCPI5_3
-; CHECK-NEXT:    vldrw.u32 q2, [r4]
+; CHECK-NEXT:    vldrw.u32 q2, [r3]
+; CHECK-NEXT:    vldrw.u32 q3, [r4]
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
 ; CHECK-NEXT:    vldrw.u32 q1, [r5]
-; CHECK-NEXT:    vldrw.u32 q3, [lr]
+; CHECK-NEXT:    vadd.i32 q2, q2, r1
 ; CHECK-NEXT:    vadd.i32 q0, q0, r1
 ; CHECK-NEXT:    vadd.i32 q1, q1, r1
-; CHECK-NEXT:    vadd.i32 q2, q2, r1
 ; CHECK-NEXT:    vadd.i32 q3, q3, r1
 ; CHECK-NEXT:    mov.w r12, #1
 ; CHECK-NEXT:    movs r4, #3
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index f9948db66b3b3..c92c2be2834e7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -656,14 +656,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vmov.i8 q1, #0xff
 ; CHECK-NEXT:    vpsel q5, q1, q0
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vmov.u8 r0, q5[0]
 ; CHECK-NEXT:    vmov.16 q3[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q5[1]
@@ -706,7 +704,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    orrs r1, r3
 ; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    vmov r2, r3, d15
-; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q7, #0x0
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
 ; CHECK-NEXT:    vmov.u8 r2, q2[3]
@@ -785,6 +783,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vpsel q6, q1, q7
+; CHECK-NEXT:    vmov.i8 q7, #0x0
 ; CHECK-NEXT:    vmov r2, r3, d12
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
@@ -853,7 +852,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
 ; CHECK-NEXT:    vmov r2, r3, d1
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    bx lr
 entry:
@@ -2065,14 +2063,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vcmp.i8 eq, q1, zr
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vmov.i8 q1, #0xff
 ; CHECK-NEXT:    vpsel q5, q1, q0
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vmov.u8 r2, q5[0]
 ; CHECK-NEXT:    vmov.16 q3[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q5[1]
@@ -2115,7 +2111,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    orr.w lr, lr, r3
 ; CHECK-NEXT:    add r12, r2
 ; CHECK-NEXT:    vmov r3, r2, d15
-; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q7, #0x0
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vmov.u8 r2, q2[3]
@@ -2194,6 +2190,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
 ; CHECK-NEXT:    vpsel q6, q1, q7
+; CHECK-NEXT:    vmov.i8 q7, #0x0
 ; CHECK-NEXT:    vmov r2, r3, d12
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
@@ -2264,7 +2261,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
 ; CHECK-NEXT:    adc.w r3, r3, lr
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
index 63b1431ac0fa4..9f551836c8b17 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -817,16 +817,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
+; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vpsel q6, q2, q0
-; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vmov.u8 r0, q6[0]
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i8 q4, #0x0
 ; CHECK-NEXT:    vmov.16 q0[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q6[1]
 ; CHECK-NEXT:    vmov.16 q0[1], r0
@@ -842,9 +840,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q0[6], r0
 ; CHECK-NEXT:    vmov.u8 r0, q6[7]
 ; CHECK-NEXT:    vmov.16 q0[7], r0
-; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vcmp.i16 ne, q0, zr
 ; CHECK-NEXT:    vmov.u8 r2, q3[0]
+; CHECK-NEXT:    vcmp.i16 ne, q0, zr
 ; CHECK-NEXT:    vpsel q7, q2, q4
 ; CHECK-NEXT:    vmov.u16 r0, q7[2]
 ; CHECK-NEXT:    vmov.u16 r1, q7[0]
@@ -895,7 +892,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    vmov r1, s16
-; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q4, #0xff
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umull r0, r2, r0, r2
 ; CHECK-NEXT:    umull r1, r3, r1, r3
@@ -916,8 +913,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.u16 r3, q7[5]
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vpsel q0, q0, q4
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vpsel q0, q4, q0
 ; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
 ; CHECK-NEXT:    vmov q4[3], q4[1], r2, r3
@@ -932,7 +929,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    vmov r0, s30
 ; CHECK-NEXT:    vmov r1, s28
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q7, #0xff
 ; CHECK-NEXT:    vmov r3, s16
 ; CHECK-NEXT:    umull r0, r2, r0, r2
 ; CHECK-NEXT:    umull r1, r3, r1, r3
@@ -960,7 +957,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    vmov r1, s16
-; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q4, #0x0
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umull r0, r2, r0, r2
 ; CHECK-NEXT:    umull r1, r3, r1, r3
@@ -1041,7 +1038,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r0, s18
 ; CHECK-NEXT:    vmov r1, s16
-; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q4, #0x0
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    umull r0, r2, r0, r2
 ; CHECK-NEXT:    umull r1, r3, r1, r3
@@ -1062,7 +1059,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.u16 r3, q6[5]
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q0, #0xff
 ; CHECK-NEXT:    vpsel q0, q0, q4
 ; CHECK-NEXT:    vmov r2, r3, d0
 ; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
@@ -1117,7 +1114,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    adc.w r1, r1, lr
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -1137,16 +1133,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vpsel q5, q2, q0
-; CHECK-NEXT:    vmov.s8 r2, q1[0]
+; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vmov.u8 r0, q5[0]
-; CHECK-NEXT:    vmov.s8 r3, q3[0]
+; CHECK-NEXT:    vmov.s8 r2, q1[0]
 ; CHECK-NEXT:    vmov.16 q4[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q5[1]
 ; CHECK-NEXT:    vmov.16 q4[1], r0
@@ -1162,9 +1156,9 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q4[6], r0
 ; CHECK-NEXT:    vmov.u8 r0, q5[7]
 ; CHECK-NEXT:    vmov.16 q4[7], r0
-; CHECK-NEXT:    smull r2, r3, r3, r2
+; CHECK-NEXT:    vmov.s8 r3, q3[0]
 ; CHECK-NEXT:    vcmp.i16 ne, q4, zr
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vpsel q6, q2, q0
 ; CHECK-NEXT:    vmov.u16 r0, q6[2]
 ; CHECK-NEXT:    vmov.u16 r1, q6[0]
@@ -1198,7 +1192,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.s8 r3, q3[3]
 ; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q7, #0x0
 ; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
@@ -1219,7 +1213,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vpsel q6, q2, q7
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vpsel q6, q2, q0
 ; CHECK-NEXT:    vmov r2, r3, d12
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
@@ -1273,17 +1268,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    vmov.16 q6[7], r2
 ; CHECK-NEXT:    vmov.s8 r0, q1[8]
 ; CHECK-NEXT:    vcmp.i16 ne, q6, zr
+; CHECK-NEXT:    vmov.i8 q6, #0x0
+; CHECK-NEXT:    vpsel q5, q2, q6
 ; CHECK-NEXT:    vmov.s8 r1, q3[8]
-; CHECK-NEXT:    vpsel q5, q2, q7
-; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vmov.u16 r2, q5[2]
 ; CHECK-NEXT:    vmov.u16 r3, q5[0]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
 ; CHECK-NEXT:    vmov.u16 r2, q5[3]
 ; CHECK-NEXT:    vmov.u16 r3, q5[1]
+; CHECK-NEXT:    smull r0, r1, r1, r0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vpsel q6, q2, q7
+; CHECK-NEXT:    vpsel q6, q2, q6
 ; CHECK-NEXT:    vmov r2, r3, d12
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
 ; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
@@ -1365,7 +1361,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
 ; CHECK-NEXT:    adc.w r1, r1, lr
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -2296,16 +2291,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #32
-; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    vmov q3, q0
-; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
+; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vpsel q6, q2, q0
-; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vmov.u8 r2, q6[0]
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i8 q4, #0x0
 ; CHECK-NEXT:    vmov.16 q0[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q6[1]
 ; CHECK-NEXT:    vmov.16 q0[1], r2
@@ -2321,9 +2314,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.16 q0[6], r2
 ; CHECK-NEXT:    vmov.u8 r2, q6[7]
 ; CHECK-NEXT:    vmov.16 q0[7], r2
-; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vcmp.i16 ne, q0, zr
 ; CHECK-NEXT:    vmov.u8 r4, q3[2]
+; CHECK-NEXT:    vcmp.i16 ne, q0, zr
 ; CHECK-NEXT:    vpsel q7, q2, q4
 ; CHECK-NEXT:    vmov.u16 r2, q7[2]
 ; CHECK-NEXT:    vmov.u16 r3, q7[0]
@@ -2374,7 +2366,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s18
 ; CHECK-NEXT:    vmov r5, s16
-; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q4, #0xff
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    umull r2, r3, r3, r2
 ; CHECK-NEXT:    umull r4, r5, r5, r4
@@ -2395,8 +2387,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.u16 r4, q7[5]
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vpsel q0, q0, q4
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vpsel q0, q4, q0
 ; CHECK-NEXT:    vmov r5, r4, d0
 ; CHECK-NEXT:    vmov q4[2], q4[0], r5, r4
 ; CHECK-NEXT:    vmov q4[3], q4[1], r5, r4
@@ -2411,7 +2403,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov r5, s18
 ; CHECK-NEXT:    vmov r2, s30
 ; CHECK-NEXT:    vmov r3, s28
-; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q7, #0xff
 ; CHECK-NEXT:    vmov r4, s16
 ; CHECK-NEXT:    umull r2, r5, r2, r5
 ; CHECK-NEXT:    umull r3, r4, r3, r4
@@ -2439,7 +2431,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov r5, s2
 ; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q4, #0x0
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    umull r2, r5, r2, r5
 ; CHECK-NEXT:    umull r3, r4, r3, r4
@@ -2520,7 +2512,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov r5, s2
 ; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    vmov r3, s16
-; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q4, #0x0
 ; CHECK-NEXT:    vmov r4, s0
 ; CHECK-NEXT:    umull r2, r5, r2, r5
 ; CHECK-NEXT:    umull r3, r4, r3, r4
@@ -2541,7 +2533,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.u16 r4, q6[5]
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q0, #0xff
 ; CHECK-NEXT:    vpsel q0, q0, q4
 ; CHECK-NEXT:    vmov r5, r4, d0
 ; CHECK-NEXT:    vmov q4[2], q4[0], r5, r4
@@ -2598,7 +2590,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
@@ -2619,14 +2610,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #16
-; CHECK-NEXT:    sub sp, #16
 ; CHECK-NEXT:    vmov q3, q0
 ; CHECK-NEXT:    vcmp.i8 eq, q2, zr
 ; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vmov.i8 q2, #0xff
 ; CHECK-NEXT:    vpsel q5, q2, q0
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i8 q0, #0x0
 ; CHECK-NEXT:    vmov.u8 r2, q5[0]
 ; CHECK-NEXT:    vmov.s8 r4, q1[2]
 ; CHECK-NEXT:    vmov.16 q4[0], r2
@@ -2676,7 +2665,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    adc.w r12, r12, r2
 ; CHECK-NEXT:    vmov r2, r3, d15
 ; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
-; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i8 q7, #0x0
 ; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
 ; CHECK-NEXT:    vmov.s8 r2, q1[3]
 ; CHECK-NEXT:    vmov.s8 r3, q3[3]
@@ -2701,7 +2690,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
 ; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vpsel q6, q2, q7
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vpsel q6, q2, q0
 ; CHECK-NEXT:    vmov r5, r4, d12
 ; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
 ; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
@@ -2755,17 +2745,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    vmov.16 q6[7], r5
 ; CHECK-NEXT:    vmov.s8 r2, q1[8]
 ; CHECK-NEXT:    vcmp.i16 ne, q6, zr
+; CHECK-NEXT:    vmov.i8 q6, #0x0
+; CHECK-NEXT:    vpsel q5, q2, q6
 ; CHECK-NEXT:    vmov.s8 r3, q3[8]
-; CHECK-NEXT:    vpsel q5, q2, q7
-; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmov.u16 r5, q5[2]
 ; CHECK-NEXT:    vmov.u16 r4, q5[0]
 ; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
 ; CHECK-NEXT:    vmov.u16 r5, q5[3]
 ; CHECK-NEXT:    vmov.u16 r4, q5[1]
+; CHECK-NEXT:    smull r2, r3, r3, r2
 ; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
 ; CHECK-NEXT:    vcmp.i32 ne, q0, zr
-; CHECK-NEXT:    vpsel q6, q2, q7
+; CHECK-NEXT:    vpsel q6, q2, q6
 ; CHECK-NEXT:    vmov r5, r4, d12
 ; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
 ; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
@@ -2849,7 +2840,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
 ; CHECK-NEXT:    adcs r3, r4
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
-; CHECK-NEXT:    add sp, #16
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
index 0c349c3aa8ec1..cba394f9e55d6 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
@@ -59,18 +59,18 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
   ; CHECK-NEXT:   [[MOVSX64rm32_:%[0-9]+]]:gr64_nosp = MOVSX64rm32 %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8)
   ; CHECK-NEXT:   [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[MOV32rm2]].sub_32bit
   ; CHECK-NEXT:   [[MOVSX64rm32_1:%[0-9]+]]:gr64 = MOVSX64rm32 %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16)
-  ; CHECK-NEXT:   MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[MOVSX64rm32_1]] :: (store (s64) into %stack.5)
   ; CHECK-NEXT:   [[MOVSX64rr32_4:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm1]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gr32 = COPY [[MOV32rm3]]
-  ; CHECK-NEXT:   [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[COPY6]]
+  ; CHECK-NEXT:   [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm3]]
   ; CHECK-NEXT:   [[MOVSX64rr32_6:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm]]
   ; CHECK-NEXT:   MOV64mr %stack.8, 1, $noreg, 0, $noreg, [[MOVSX64rr32_6]] :: (store (s64) into %stack.8)
+  ; CHECK-NEXT:   MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[COPY1]] :: (store (s64) into %stack.5)
   ; CHECK-NEXT:   MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOVSX64rr32_4]] :: (store (s64) into %stack.6)
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]]
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY7]], [[MOVSX64rr32_2]], implicit-def dead $eflags
-  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY7]], [[MOVSX64rm32_]], implicit-def dead $eflags
-  ; CHECK-NEXT:   [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY7]], 0, $noreg
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]]
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY6]], [[MOVSX64rr32_2]], implicit-def dead $eflags
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY6]], [[MOVSX64rm32_]], implicit-def dead $eflags
+  ; CHECK-NEXT:   [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY6]], 0, $noreg
   ; CHECK-NEXT:   MOV64mr %stack.9, 1, $noreg, 0, $noreg, [[LEA64r]] :: (store (s64) into %stack.9)
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[MOV32rm2]]
   ; CHECK-NEXT:   MOV64mr %stack.7, 1, $noreg, 0, $noreg, [[MOVSX64rr32_5]] :: (store (s64) into %stack.7)
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:gr64 = IMUL64rr [[COPY8]], [[MOVSX64rr32_2]], implicit-def dead $eflags
@@ -87,8 +87,11 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8)
   ; CHECK-NEXT:   CMP32rm [[MOV32rm4]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16)
-  ; CHECK-NEXT:   [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
-  ; CHECK-NEXT:   [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
+  ; CHECK-NEXT:   [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2)
+  ; CHECK-NEXT:   [[MOV64rm:%[0-9]+]]:gr64_nosp = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
+  ; CHECK-NEXT:   [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11)
+  ; CHECK-NEXT:   [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm6]]
   ; CHECK-NEXT:   JCC_1 %bb.5, 13, implicit $eflags
   ; CHECK-NEXT:   JMP_1 %bb.3
   ; CHECK-NEXT: {{  $}}
@@ -98,9 +101,8 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
   ; CHECK-NEXT:   [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
   ; CHECK-NEXT:   [[MOV64rm2:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm2]], [[MOVSX64rr32_]], implicit-def dead $eflags
   ; CHECK-NEXT:   [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm2]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1)
-  ; CHECK-NEXT:   MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[MOV64rm2]] :: (store (s64) into %stack.13)
-  ; CHECK-NEXT:   [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12)
-  ; CHECK-NEXT:   undef [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm5]]
+  ; CHECK-NEXT:   [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12)
+  ; CHECK-NEXT:   undef [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm7]]
   ; CHECK-NEXT:   [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9)
   ; CHECK-NEXT:   [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
   ; CHECK-NEXT:   JMP_1 %bb.6
@@ -123,40 +125,30 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
   ; CHECK-NEXT: bb.6.for.body17:
   ; CHECK-NEXT:   successors: %bb.6(0x7c000000), %bb.5(0x04000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit
-  ; CHECK-NEXT:   [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13)
-  ; CHECK-NEXT:   [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm7]], 1, [[MOVSX64rr32_]], 0, $noreg
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY9]].sub_32bit
-  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r1]]
-  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:gr64 = COPY [[MOV64rm1]]
-  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY6]]
-  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]]
-  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]]
-  ; CHECK-NEXT:   [[COPY16:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]]
-  ; CHECK-NEXT:   [[COPY17:%[0-9]+]]:gr64 = COPY [[MOV32rm2]]
-  ; CHECK-NEXT:   [[COPY18:%[0-9]+]]:gr64 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY18]], 1, [[COPY9]], 0, $noreg
-  ; CHECK-NEXT:   [[COPY19:%[0-9]+]]:gr64_nosp = COPY [[MOV64rm]]
-  ; CHECK-NEXT:   [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2)
-  ; CHECK-NEXT:   [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[COPY19]], 0, $noreg
-  ; CHECK-NEXT:   [[MOV64rm:%[0-9]+]]:gr64 = COPY [[COPY19]]
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY18]]
-  ; CHECK-NEXT:   [[MOV32rm2:%[0-9]+]]:gr64_nosp = COPY [[COPY17]]
-  ; CHECK-NEXT:   [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY16]]
-  ; CHECK-NEXT:   [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY15]]
-  ; CHECK-NEXT:   [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY14]]
-  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gr32 = COPY [[COPY13]]
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gr32 = COPY [[COPY12]]
-  ; CHECK-NEXT:   [[MOV64rm1:%[0-9]+]]:gr64 = COPY [[COPY11]]
-  ; CHECK-NEXT:   [[MOV64rm8:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11)
-  ; CHECK-NEXT:   [[LEA64_32r1:%[0-9]+]]:gr32 = COPY [[COPY10]]
-  ; CHECK-NEXT:   [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY6]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]]
-  ; CHECK-NEXT:   PTILESTOREDV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]]
+  ; CHECK-NEXT:   [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit
+  ; CHECK-NEXT:   [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY9]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm2]], 1, [[MOVSX64rr32_]], 0, $noreg
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY10]].sub_32bit
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]]
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]]
+  ; CHECK-NEXT:   [[COPY13:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]]
+  ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]]
+  ; CHECK-NEXT:   [[COPY15:%[0-9]+]]:gr64 = COPY [[COPY7]]
+  ; CHECK-NEXT:   [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
+  ; CHECK-NEXT:   [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[MOV64rm7]], 1, [[COPY10]], 0, $noreg
+  ; CHECK-NEXT:   [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm]], 0, $noreg
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[COPY15]]
+  ; CHECK-NEXT:   [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY14]]
+  ; CHECK-NEXT:   [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY13]]
+  ; CHECK-NEXT:   [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY12]]
+  ; CHECK-NEXT:   [[MOVSX64rm32_1:%[0-9]+]]:gr64 = COPY [[COPY11]]
+  ; CHECK-NEXT:   [[MOV32rm8:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm8]]
+  ; CHECK-NEXT:   [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY9]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]]
+  ; CHECK-NEXT:   PTILESTOREDV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]]
   ; CHECK-NEXT:   [[MOV64rm4:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm4]], [[MOVSX64rr32_3]], implicit-def dead $eflags
-  ; CHECK-NEXT:   [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm8]], implicit-def dead $eflags
-  ; CHECK-NEXT:   [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY9]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags
-  ; CHECK-NEXT:   CMP64rr [[MOV64rm4]], [[MOV64rm1]], implicit-def $eflags
+  ; CHECK-NEXT:   [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm1]], implicit-def dead $eflags
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY10]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags
+  ; CHECK-NEXT:   CMP64rr [[MOV64rm4]], [[MOVSX64rm32_1]], implicit-def $eflags
   ; CHECK-NEXT:   JCC_1 %bb.6, 12, implicit $eflags
   ; CHECK-NEXT:   JMP_1 %bb.5
 entry:
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index bf6b09674e187..b428ce457ff40 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -136,8 +136,6 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    movl (%r8), %edx
 ; CHECK-NEXT:    leal 8(,%rbx,8), %eax
 ; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT:    leaq 8(%rsi), %rax
-; CHECK-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT:    leaq 32(%rsi), %r11
 ; CHECK-NEXT:    leaq 8(,%rbx,8), %rbx
 ; CHECK-NEXT:    xorl %r14d, %r14d
@@ -189,7 +187,8 @@ define void @_Z2x6v() local_unnamed_addr {
 ; CHECK-NEXT:    jae .LBB1_7
 ; CHECK-NEXT:  # %bb.6: # %vector.memcheck
 ; CHECK-NEXT:    # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; CHECK-NEXT:    leaq 8(%rsi), %r9
+; CHECK-NEXT:    addq %r9, %rax
 ; CHECK-NEXT:    leaq (%rax,%r10,8), %rax
 ; CHECK-NEXT:    cmpq %r15, %rax
 ; CHECK-NEXT:    ja .LBB1_14
diff --git a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
index 10ee445788757..d355374097651 100644
--- a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
+++ b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
@@ -7,8 +7,8 @@
 # CHECK: jne
 # CHECK: andl    $-16, %edx
 # CHECK: xorl    %ebx, %ebx
-# CHECK: movl    -16(%ebp), %esi
-# CHECK: xorl    %eax, %eax
+# CHECK: xorl    %esi, %esi
+# CHECK: movl    %eax, %ecx
 
 name:            test
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/X86/inalloca-invoke.ll b/llvm/test/CodeGen/X86/inalloca-invoke.ll
index c2728f78c81e4..68cb24dece0a9 100644
--- a/llvm/test/CodeGen/X86/inalloca-invoke.ll
+++ b/llvm/test/CodeGen/X86/inalloca-invoke.ll
@@ -23,7 +23,6 @@ blah:
 ; CHECK:  pushl   %eax
 ; CHECK:  subl    $20, %esp
 ; CHECK:  movl %esp, %[[beg:[^ ]*]]
-; CHECK:  leal 12(%[[beg]]), %[[end:[^ ]*]]
 
   call void @begin(ptr sret(%Iter) %temp.lvalue)
 ; CHECK:  calll _begin
@@ -32,6 +31,7 @@ blah:
           to label %invoke.cont unwind label %lpad
 
 ;  Uses end as sret param.
+; CHECK:  leal 12(%[[beg]]), %[[end:[^ ]*]]
 ; CHECK:  pushl %[[end]]
 ; CHECK:  calll _plus
 
diff --git a/llvm/test/CodeGen/X86/licm-regpressure.ll b/llvm/test/CodeGen/X86/licm-regpressure.ll
index 72a4832062765..26ed2a37fb647 100644
--- a/llvm/test/CodeGen/X86/licm-regpressure.ll
+++ b/llvm/test/CodeGen/X86/licm-regpressure.ll
@@ -1,14 +1,64 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; This tests currently fails as MachineLICM does not compute register pressure
-; correctly. More details: llvm.org/PR23143
-; XFAIL: *
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-linux -stop-after=early-machinelicm -o - | FileCheck %s
 
-; MachineLICM should take register pressure into account.
-; CHECK-NOT: Spill
+; FIXME: MachineLICM does not compute register pressure correctly and we end up
+; emitting too many ADD64ri32s. More details: llvm.org/PR23143
 
 %struct.A = type { i32, i32, i32, i32, i32, i32, i32 }
 
 define void @test(i1 %b, ptr %a) nounwind {
+  ; CHECK-LABEL: name: test
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $edi, $rsi
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rsi
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr32 = COPY $edi
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr8 = COPY [[COPY1]].sub_8bit
+  ; CHECK-NEXT:   [[ADD64ri32_:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 4, implicit-def dead $eflags
+  ; CHECK-NEXT:   [[ADD64ri32_1:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 8, implicit-def dead $eflags
+  ; CHECK-NEXT:   [[ADD64ri32_2:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 12, implicit-def dead $eflags
+  ; CHECK-NEXT:   [[ADD64ri32_3:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 16, implicit-def dead $eflags
+  ; CHECK-NEXT:   [[ADD64ri32_4:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 20, implicit-def dead $eflags
+  ; CHECK-NEXT:   [[ADD64ri32_5:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 24, implicit-def dead $eflags
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.loop-body:
+  ; CHECK-NEXT:   successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   $rdi = COPY [[COPY]]
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   $rdi = COPY [[ADD64ri32_]]
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   $rdi = COPY [[ADD64ri32_1]]
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   $rdi = COPY [[ADD64ri32_2]]
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   $rdi = COPY [[ADD64ri32_3]]
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   $rdi = COPY [[ADD64ri32_4]]
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   $rdi = COPY [[ADD64ri32_5]]
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+  ; CHECK-NEXT:   ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+  ; CHECK-NEXT:   TEST8ri [[COPY2]], 1, implicit-def $eflags
+  ; CHECK-NEXT:   JCC_1 %bb.1, 5, implicit $eflags
+  ; CHECK-NEXT:   JMP_1 %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.loop-exit:
+  ; CHECK-NEXT:   RET 0
 entry:
   br label %loop-header
 

>From 19afda039564809f5f7a40e688cb30354fc767cf Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 1 Oct 2025 08:48:28 -0400
Subject: [PATCH 2/2] Fix SystemZ test

---
 llvm/test/CodeGen/SystemZ/llvm.sincos.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll
index 97980770cc7c9..e3ed31fae70c0 100644
--- a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll
+++ b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll
@@ -163,9 +163,9 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 {
 ; LINUX-NEXT:    ld %f10, 8(%r3)
 ; LINUX-NEXT:    ld %f0, 16(%r3)
 ; LINUX-NEXT:    ld %f2, 24(%r3)
-; LINUX-NEXT:    la %r3, 16(%r2)
-; LINUX-NEXT:    la %r4, 48(%r2)
 ; LINUX-NEXT:    la %r2, 176(%r15)
+; LINUX-NEXT:    la %r3, 16(%r13)
+; LINUX-NEXT:    la %r4, 48(%r13)
 ; LINUX-NEXT:    std %f0, 176(%r15)
 ; LINUX-NEXT:    std %f2, 184(%r15)
 ; LINUX-NEXT:    brasl %r14, sincosl at PLT



More information about the llvm-commits mailing list