[llvm] d8127b2 - InlineSpiller: Consider if all subranges are the same when avoiding redundant spills

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Sun Oct 1 01:38:37 PDT 2023


Author: Matt Arsenault
Date: 2023-10-01T11:37:53+03:00
New Revision: d8127b2ba8a87a610851b9a462f2fc2526c36e37

URL: https://github.com/llvm/llvm-project/commit/d8127b2ba8a87a610851b9a462f2fc2526c36e37
DIFF: https://github.com/llvm/llvm-project/commit/d8127b2ba8a87a610851b9a462f2fc2526c36e37.diff

LOG: InlineSpiller: Consider if all subranges are the same when avoiding redundant spills

This avoids some redundant spills of subranges and fixes a compile failure.
It also greatly reduces the number of spills in a loop.

The main range's value number is not informative when multiple instructions are needed
to fully define a register. A common scenario is a lowered REG_SEQUENCE where every
subregister is defined sequentially, and each def changes the main range's value number.
If we look at the specific lanes at the use index, we can see the value is actually the
same.
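
For illustration, here is a simplified, hypothetical MIR-style sketch of that situation
(AMDGPU opcodes and the sreg_64 register class are used only for concreteness; this is
not taken from the testcase):

  undef %0.sub0:sreg_64 = S_MOV_B32 1   ; starts a value in the main range and in the
                                        ; sub0 subrange
  %0.sub1:sreg_64 = S_MOV_B32 2         ; partial def: the main range gets a new value
                                        ; number, the sub0 subrange is unchanged
  ...
  SI_SPILL_S64_SAVE %0, ...             ; the main-range value number at this use is not
                                        ; the first def's, even though each lane still
                                        ; holds the value its own def produced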

In this testcase, there are a large number of materialized 64-bit constant defs
which are hoisted out of the loop by MachineLICM. These feed REG_SEQUENCEs, which
are not considered rematerializable inside the loop. After coalescing, the split
constant defs produce main ranges with an apparent phi def. There is no phi def if
you look at each individual subrange, and only half of the register is really
redefined to a constant.

Fixes: SWDEV-380865

https://reviews.llvm.org/D147079

Added: 
    

Modified: 
    llvm/lib/CodeGen/InlineSpiller.cpp
    llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
    llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
    llvm/test/CodeGen/AMDGPU/swdev380865.ll
    llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
    llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
    llvm/test/CodeGen/Thumb2/mve-vst3.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index c62f3db9d321562..1636a225d672a9a 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -495,6 +495,31 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
   return true;
 }
 
+/// Check if all subranges in \p LI and \p SLI have the same value number at \p
+/// Idx.
+static bool allSubRangeValNoSame(const LiveInterval &LI,
+                                 const LiveInterval &SLI,
+                                 const MachineInstr &MI,
+                                 const MachineRegisterInfo &MRI,
+                                 const TargetRegisterInfo &TRI, SlotIndex Idx) {
+  for (auto &SR : SLI.subranges()) {
+    VNInfo *SubVNI = SR.getVNInfoAt(Idx);
+
+    for (auto &SubLI : LI.subranges()) {
+      if (SubLI.LaneMask == SR.LaneMask) {
+        if (SubVNI != SubLI.getVNInfoAt(Idx))
+          return false;
+      } else if ((SubLI.LaneMask & SR.LaneMask).any()) {
+        // TODO: Check non-exact, overlapping subranges if they share the same
+        // def instruction
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
 /// eliminateRedundantSpills - SLI:VNI is known to be on the stack. Remove any
 /// redundant spills of this value in SLI.reg and sibling copies.
 void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
@@ -524,7 +549,13 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
       if (!MI.mayStore() && !TII.isCopyInstr(MI))
         continue;
       SlotIndex Idx = LIS.getInstructionIndex(MI);
-      if (LI->getVNInfoAt(Idx) != VNI)
+
+      // The main range value numbers will differ if multiple instructions are
+      // used to define its various subregisters. Check the subregister value
+      // numbers as a fallback.
+      if (LI->getVNInfoAt(Idx) != VNI &&
+          (!SLI.hasSubRanges() ||
+           !allSubRangeValNoSame(*LI, SLI, MI, MRI, TRI, Idx)))
         continue;
 
       // Follow sibling copies down the dominator tree.

diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
index 7209d160e6c8a7a..857784282528bc7 100644
--- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
@@ -47,7 +47,7 @@ body:             |
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1:
   ; GCN-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NEXT:   liveins: $exec:0x000000000000000F, $sgpr30, $sgpr31, $vgpr0:0x0000000000000003, $vgpr1:0x0000000000000003, $vgpr2:0x0000000000000003, $vgpr3:0x0000000000000003, $vgpr4:0x0000000000000003, $vgpr5:0x0000000000000003, $vgpr6:0x0000000000000003, $vgpr7:0x0000000000000003, $vgpr8:0x0000000000000003, $vgpr9:0x0000000000000003, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F
+  ; GCN-NEXT:   liveins: $exec:0x000000000000000F, $sgpr30, $sgpr31, $vgpr0:0x0000000000000003, $vgpr1:0x0000000000000003, $vgpr2:0x0000000000000003, $vgpr3:0x0000000000000003, $vgpr4:0x0000000000000003, $vgpr5:0x0000000000000003, $vgpr6:0x0000000000000003, $vgpr7:0x0000000000000003, $vgpr8:0x0000000000000003, $vgpr9:0x0000000000000003, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   renamable $vgpr57 = COPY $vgpr9, implicit $exec
   ; GCN-NEXT:   renamable $vgpr56 = COPY $vgpr8, implicit $exec
@@ -62,17 +62,15 @@ body:             |
   ; GCN-NEXT:   renamable $sgpr16_sgpr17 = IMPLICIT_DEF
   ; GCN-NEXT:   $vgpr40 = V_WRITELANE_B32 $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
   ; GCN-NEXT:   $vgpr40 = V_WRITELANE_B32 $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15, implicit $vgpr14_vgpr15 :: (store (s32) into %stack.1, addrspace 5)
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit killed $vgpr14_vgpr15 :: (store (s32) into %stack.1 + 4, addrspace 5)
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr10_vgpr11, implicit $vgpr10_vgpr11 :: (store (s32) into %stack.2, addrspace 5)
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit killed $vgpr10_vgpr11 :: (store (s32) into %stack.2 + 4, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr10_vgpr11, implicit $vgpr10_vgpr11 :: (store (s32) into %stack.1, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit killed $vgpr10_vgpr11 :: (store (s32) into %stack.1 + 4, addrspace 5)
   ; GCN-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu, implicit-def dead $vgpr0
-  ; GCN-NEXT:   $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1, addrspace 5)
-  ; GCN-NEXT:   $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1 + 4, addrspace 5)
+  ; GCN-NEXT:   $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.2, addrspace 5)
+  ; GCN-NEXT:   $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.2 + 4, addrspace 5)
   ; GCN-NEXT:   renamable $vgpr0_vgpr1 = nofpexcept V_FMA_F64_e64 0, killed $vgpr45_vgpr46, 0, killed $vgpr41_vgpr42, 0, killed $vgpr60_vgpr61, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   FLAT_STORE_DWORDX2 killed renamable $vgpr58_vgpr59, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
-  ; GCN-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.2, addrspace 5)
-  ; GCN-NEXT:   $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.2 + 4, addrspace 5)
+  ; GCN-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1, addrspace 5)
+  ; GCN-NEXT:   $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1 + 4, addrspace 5)
   ; GCN-NEXT:   FLAT_STORE_DWORDX2 killed renamable $vgpr0_vgpr1, killed renamable $vgpr56_vgpr57, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2:

diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 08db1e7fee259d6..9955aca981518df 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10093,7 +10093,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240
 ; GFX6-NEXT:    s_addc_u32 s41, s41, 0
-; GFX6-NEXT:    s_mov_b32 s2, 0x83800
+; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
 ; GFX6-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10103,7 +10103,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224
-; GFX6-NEXT:    s_mov_b32 s2, 0x83400
+; GFX6-NEXT:    s_mov_b32 s2, 0x83800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10112,7 +10112,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208
-; GFX6-NEXT:    s_mov_b32 s2, 0x83000
+; GFX6-NEXT:    s_mov_b32 s2, 0x83400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10121,7 +10121,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192
-; GFX6-NEXT:    s_mov_b32 s2, 0x82c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x83000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10130,7 +10130,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176
-; GFX6-NEXT:    s_mov_b32 s2, 0x82800
+; GFX6-NEXT:    s_mov_b32 s2, 0x82c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10139,7 +10139,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160
-; GFX6-NEXT:    s_mov_b32 s2, 0x82400
+; GFX6-NEXT:    s_mov_b32 s2, 0x82800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10148,7 +10148,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144
-; GFX6-NEXT:    s_mov_b32 s2, 0x82000
+; GFX6-NEXT:    s_mov_b32 s2, 0x82400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10157,7 +10157,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128
-; GFX6-NEXT:    s_mov_b32 s2, 0x81c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x82000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10166,7 +10166,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112
-; GFX6-NEXT:    s_mov_b32 s2, 0x81800
+; GFX6-NEXT:    s_mov_b32 s2, 0x81c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10175,7 +10175,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96
-; GFX6-NEXT:    s_mov_b32 s2, 0x81400
+; GFX6-NEXT:    s_mov_b32 s2, 0x81800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10184,7 +10184,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80
-; GFX6-NEXT:    s_mov_b32 s2, 0x81000
+; GFX6-NEXT:    s_mov_b32 s2, 0x81400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10218,7 +10218,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[7:8], s[4:7], 0 addr64 offset:48
-; GFX6-NEXT:    s_mov_b32 s2, 0x80c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x81000
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 13, v0
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10245,7 +10245,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s9, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s10, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s11, 7
-; GFX6-NEXT:    s_mov_b32 s12, 0x83c00
+; GFX6-NEXT:    s_mov_b32 s12, 0x84000
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10273,197 +10273,183 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_and_saveexec_b64 s[34:35], vcc
 ; GFX6-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX6-NEXT:  ; %bb.1: ; %bb0
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v4, s8, 0
-; GFX6-NEXT:    v_writelane_b32 v4, s9, 1
-; GFX6-NEXT:    v_writelane_b32 v4, s10, 2
-; GFX6-NEXT:    v_writelane_b32 v4, s11, 3
-; GFX6-NEXT:    v_writelane_b32 v4, s12, 4
-; GFX6-NEXT:    v_writelane_b32 v4, s13, 5
-; GFX6-NEXT:    v_writelane_b32 v4, s14, 6
-; GFX6-NEXT:    v_writelane_b32 v4, s15, 7
-; GFX6-NEXT:    s_mov_b32 s38, 0x84400
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v0, s8, 0
+; GFX6-NEXT:    v_writelane_b32 v0, s9, 1
+; GFX6-NEXT:    v_writelane_b32 v0, s10, 2
+; GFX6-NEXT:    v_writelane_b32 v0, s11, 3
+; GFX6-NEXT:    v_writelane_b32 v0, s12, 4
+; GFX6-NEXT:    v_writelane_b32 v0, s13, 5
+; GFX6-NEXT:    v_writelane_b32 v0, s14, 6
+; GFX6-NEXT:    v_writelane_b32 v0, s15, 7
+; GFX6-NEXT:    s_mov_b32 s36, 0x80800
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x83c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s36, 0x84000
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s8, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s9, v4, 1
-; GFX6-NEXT:    v_readlane_b32 s10, v4, 2
-; GFX6-NEXT:    v_readlane_b32 s11, v4, 3
-; GFX6-NEXT:    v_readlane_b32 s12, v4, 4
-; GFX6-NEXT:    v_readlane_b32 s13, v4, 5
-; GFX6-NEXT:    v_readlane_b32 s14, v4, 6
-; GFX6-NEXT:    v_readlane_b32 s15, v4, 7
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s8, v0, 0
+; GFX6-NEXT:    v_readlane_b32 s9, v0, 1
+; GFX6-NEXT:    v_readlane_b32 s10, v0, 2
+; GFX6-NEXT:    v_readlane_b32 s11, v0, 3
+; GFX6-NEXT:    v_readlane_b32 s12, v0, 4
+; GFX6-NEXT:    v_readlane_b32 s13, v0, 5
+; GFX6-NEXT:    v_readlane_b32 s14, v0, 6
+; GFX6-NEXT:    v_readlane_b32 s15, v0, 7
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v4, s16, 0
-; GFX6-NEXT:    v_writelane_b32 v4, s17, 1
-; GFX6-NEXT:    v_writelane_b32 v4, s18, 2
-; GFX6-NEXT:    v_writelane_b32 v4, s19, 3
-; GFX6-NEXT:    v_writelane_b32 v4, s20, 4
-; GFX6-NEXT:    v_writelane_b32 v4, s21, 5
-; GFX6-NEXT:    v_writelane_b32 v4, s22, 6
-; GFX6-NEXT:    v_writelane_b32 v4, s23, 7
-; GFX6-NEXT:    s_mov_b32 s38, 0x84c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v0, s16, 0
+; GFX6-NEXT:    v_writelane_b32 v0, s17, 1
+; GFX6-NEXT:    v_writelane_b32 v0, s18, 2
+; GFX6-NEXT:    v_writelane_b32 v0, s19, 3
+; GFX6-NEXT:    v_writelane_b32 v0, s20, 4
+; GFX6-NEXT:    v_writelane_b32 v0, s21, 5
+; GFX6-NEXT:    v_writelane_b32 v0, s22, 6
+; GFX6-NEXT:    v_writelane_b32 v0, s23, 7
+; GFX6-NEXT:    s_mov_b32 s36, 0x84800
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x84400
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s36, 0x80800
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s16, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s17, v4, 1
-; GFX6-NEXT:    v_readlane_b32 s18, v4, 2
-; GFX6-NEXT:    v_readlane_b32 s19, v4, 3
-; GFX6-NEXT:    v_readlane_b32 s20, v4, 4
-; GFX6-NEXT:    v_readlane_b32 s21, v4, 5
-; GFX6-NEXT:    v_readlane_b32 s22, v4, 6
-; GFX6-NEXT:    v_readlane_b32 s23, v4, 7
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s16, v0, 0
+; GFX6-NEXT:    v_readlane_b32 s17, v0, 1
+; GFX6-NEXT:    v_readlane_b32 s18, v0, 2
+; GFX6-NEXT:    v_readlane_b32 s19, v0, 3
+; GFX6-NEXT:    v_readlane_b32 s20, v0, 4
+; GFX6-NEXT:    v_readlane_b32 s21, v0, 5
+; GFX6-NEXT:    v_readlane_b32 s22, v0, 6
+; GFX6-NEXT:    v_readlane_b32 s23, v0, 7
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v4, s24, 0
-; GFX6-NEXT:    v_writelane_b32 v4, s25, 1
-; GFX6-NEXT:    v_writelane_b32 v4, s26, 2
-; GFX6-NEXT:    v_writelane_b32 v4, s27, 3
-; GFX6-NEXT:    v_writelane_b32 v4, s28, 4
-; GFX6-NEXT:    v_writelane_b32 v4, s29, 5
-; GFX6-NEXT:    v_writelane_b32 v4, s30, 6
-; GFX6-NEXT:    v_writelane_b32 v4, s31, 7
-; GFX6-NEXT:    s_mov_b32 s38, 0x85400
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v0, s24, 0
+; GFX6-NEXT:    v_writelane_b32 v0, s25, 1
+; GFX6-NEXT:    v_writelane_b32 v0, s26, 2
+; GFX6-NEXT:    v_writelane_b32 v0, s27, 3
+; GFX6-NEXT:    v_writelane_b32 v0, s28, 4
+; GFX6-NEXT:    v_writelane_b32 v0, s29, 5
+; GFX6-NEXT:    v_writelane_b32 v0, s30, 6
+; GFX6-NEXT:    v_writelane_b32 v0, s31, 7
+; GFX6-NEXT:    s_mov_b32 s36, 0x85000
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x84c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s36, 0x84800
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s24, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s25, v4, 1
-; GFX6-NEXT:    v_readlane_b32 s26, v4, 2
-; GFX6-NEXT:    v_readlane_b32 s27, v4, 3
-; GFX6-NEXT:    v_readlane_b32 s28, v4, 4
-; GFX6-NEXT:    v_readlane_b32 s29, v4, 5
-; GFX6-NEXT:    v_readlane_b32 s30, v4, 6
-; GFX6-NEXT:    v_readlane_b32 s31, v4, 7
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s24, v0, 0
+; GFX6-NEXT:    v_readlane_b32 s25, v0, 1
+; GFX6-NEXT:    v_readlane_b32 s26, v0, 2
+; GFX6-NEXT:    v_readlane_b32 s27, v0, 3
+; GFX6-NEXT:    v_readlane_b32 s28, v0, 4
+; GFX6-NEXT:    v_readlane_b32 s29, v0, 5
+; GFX6-NEXT:    v_readlane_b32 s30, v0, 6
+; GFX6-NEXT:    v_readlane_b32 s31, v0, 7
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
-; GFX6-NEXT:    s_mov_b64 s[36:37], exec
-; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v4, s0, 0
-; GFX6-NEXT:    v_writelane_b32 v4, s1, 1
-; GFX6-NEXT:    v_writelane_b32 v4, s2, 2
-; GFX6-NEXT:    v_writelane_b32 v4, s3, 3
-; GFX6-NEXT:    s_mov_b32 s38, 0x85c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v4, s4, 0
-; GFX6-NEXT:    v_writelane_b32 v4, s5, 1
-; GFX6-NEXT:    v_writelane_b32 v4, s6, 2
-; GFX6-NEXT:    v_writelane_b32 v4, s7, 3
-; GFX6-NEXT:    s_mov_b32 s36, 0x86000
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v0, s4, 0
+; GFX6-NEXT:    v_writelane_b32 v0, s5, 1
+; GFX6-NEXT:    v_writelane_b32 v0, s6, 2
+; GFX6-NEXT:    v_writelane_b32 v0, s7, 3
+; GFX6-NEXT:    s_mov_b32 s36, 0x85800
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v4, s2, 0
-; GFX6-NEXT:    v_writelane_b32 v4, s3, 1
-; GFX6-NEXT:    s_mov_b32 s4, 0x86400
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s4 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v0, s2, 0
+; GFX6-NEXT:    v_writelane_b32 v0, s3, 1
+; GFX6-NEXT:    s_mov_b32 s4, 0x85c00
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x85400
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s38, 0x85000
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
-; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
-; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
-; GFX6-NEXT:    v_readlane_b32 s4, v4, 4
-; GFX6-NEXT:    v_readlane_b32 s5, v4, 5
-; GFX6-NEXT:    v_readlane_b32 s6, v4, 6
-; GFX6-NEXT:    v_readlane_b32 s7, v4, 7
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s0, v0, 0
+; GFX6-NEXT:    v_readlane_b32 s1, v0, 1
+; GFX6-NEXT:    v_readlane_b32 s2, v0, 2
+; GFX6-NEXT:    v_readlane_b32 s3, v0, 3
+; GFX6-NEXT:    v_readlane_b32 s4, v0, 4
+; GFX6-NEXT:    v_readlane_b32 s5, v0, 5
+; GFX6-NEXT:    v_readlane_b32 s6, v0, 6
+; GFX6-NEXT:    v_readlane_b32 s7, v0, 7
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2180
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0x2160
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, v1, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s36, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s37, v4, 1
-; GFX6-NEXT:    v_readlane_b32 s38, v4, 2
-; GFX6-NEXT:    v_readlane_b32 s39, v4, 3
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s36, v0, 0
+; GFX6-NEXT:    v_readlane_b32 s37, v0, 1
+; GFX6-NEXT:    v_readlane_b32 s38, v0, 2
+; GFX6-NEXT:    v_readlane_b32 s39, v0, 3
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_mov_b64 vcc, s[34:35]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2190
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0x2170
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, v1, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s34, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s35, v4, 1
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s34, v0, 0
+; GFX6-NEXT:    v_readlane_b32 s35, v0, 1
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    ;;#ASMSTART
@@ -10472,44 +10458,31 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[34:35], vcc
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    s_mov_b32 s6, 0x85c00
-; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s6, 0x85e00
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s6 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
-; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
-; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
-; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
-; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s0, v0, 0
+; GFX6-NEXT:    v_readlane_b32 s1, v0, 1
+; GFX6-NEXT:    v_readlane_b32 s2, v0, 2
+; GFX6-NEXT:    v_readlane_b32 s3, v0, 3
+; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
-; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_mov_b32 s2, 0x84400
-; GFX6-NEXT:    buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
-; GFX6-NEXT:    buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
-; GFX6-NEXT:    s_waitcnt expcnt(4)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v17
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v18
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v19
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v20
-; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
+; GFX6-NEXT:    s_mov_b32 s2, 0x84800
 ; GFX6-NEXT:    v_mov_b32_e32 v20, v3
 ; GFX6-NEXT:    buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x84000
 ; GFX6-NEXT:    v_mov_b32_e32 v19, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v18, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v17, v0
@@ -10545,14 +10518,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX6-NEXT:    s_mov_b32 s4, 0x83800
+; GFX6-NEXT:    s_mov_b32 s4, 0x83c00
 ; GFX6-NEXT:    v_lshl_b64 v[4:5], v[5:6], 8
 ; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; GFX6-NEXT:    s_mov_b32 s4, 0x83400
+; GFX6-NEXT:    s_mov_b32 s4, 0x83800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10560,7 +10533,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x83000
+; GFX6-NEXT:    s_mov_b32 s4, 0x83400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10568,7 +10541,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82c00
+; GFX6-NEXT:    s_mov_b32 s4, 0x83000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10576,7 +10549,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82800
+; GFX6-NEXT:    s_mov_b32 s4, 0x82c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10584,7 +10557,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82400
+; GFX6-NEXT:    s_mov_b32 s4, 0x82800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10592,7 +10565,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82000
+; GFX6-NEXT:    s_mov_b32 s4, 0x82400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10600,7 +10573,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81c00
+; GFX6-NEXT:    s_mov_b32 s4, 0x82000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10608,7 +10581,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81800
+; GFX6-NEXT:    s_mov_b32 s4, 0x81c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10616,7 +10589,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81400
+; GFX6-NEXT:    s_mov_b32 s4, 0x81800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10624,7 +10597,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81000
+; GFX6-NEXT:    s_mov_b32 s4, 0x81400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10632,7 +10605,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x80c00
+; GFX6-NEXT:    s_mov_b32 s4, 0x81000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80
 ; GFX6-NEXT:    buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:64
@@ -10664,19 +10637,19 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:240
 ; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v7, 1
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v0, s[38:39] offset:224
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:208
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v0, s[38:39] offset:192
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:176
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v0, s[38:39] offset:160
@@ -10685,7 +10658,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:128
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:112
@@ -10746,26 +10719,24 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[34:35], vcc
 ; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, v16
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39]
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20d0
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20e0
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20f0
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2100
-; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT:    s_nop 0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, v17
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, v18
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v3, v19
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20f0
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20e0
-; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v19, v3
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20d0
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v18, v2
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v17, v1
+; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v16, v0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
@@ -10781,18 +10752,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX9-FLATSCR-NEXT:  .LBB1_2: ; %ret
 ; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    v_lshlrev_b64 v[4:5], 8, v[5:6]
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, s37
 ; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v4, vcc, s36, v4
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[12:15], off offset:240
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[8:11], off offset:224
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:208
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[20:23], off offset:192
@@ -10802,7 +10773,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[20:23], off offset:176
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[16:19], off offset:160
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2060
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload
@@ -10897,10 +10868,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX10-FLATSCR-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX10-FLATSCR-NEXT:  ; %bb.1: ; %bb0
-; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
-; GFX10-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35]
-; GFX10-FLATSCR-NEXT:    ;;#ASMEND
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v88, v59
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v92, v63
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v87, v58
@@ -10910,7 +10877,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v90, v61
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v89, v60
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v60, v35
-; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[64:67], s0 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v68, v39
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v59, v34
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v58, v33
@@ -10959,6 +10925,9 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v55, v30
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v54, v29
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX10-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35]
+; GFX10-FLATSCR-NEXT:    ;;#ASMEND
+; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v8, v33
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v53
@@ -10990,6 +10959,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v60
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v36, v65
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v37, v66
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v38, v67

diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
index 7201ffaf561662a..70687e15947c1d5 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
@@ -16,88 +16,63 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_mov_b64 s[0:1], 0
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x0
-; CHECK-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
 ; CHECK-NEXT:    ; kill: killed $sgpr0_sgpr1
-; CHECK-NEXT:    s_mov_b32 s7, 0x401c0000
-; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v2, s2, 0
-; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40180000
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_writelane_b32 v2, s2, 0
 ; CHECK-NEXT:    v_writelane_b32 v2, s0, 1
 ; CHECK-NEXT:    v_writelane_b32 v2, s1, 2
-; CHECK-NEXT:    s_mov_b32 s1, 0x40220000
+; CHECK-NEXT:    s_mov_b32 s1, 0x40240000
 ; CHECK-NEXT:    v_writelane_b32 v2, s0, 3
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:    v_writelane_b32 v2, s1, 4
-; CHECK-NEXT:    s_mov_b32 s1, 0x40240000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 6
-; CHECK-NEXT:    s_mov_b32 s1, 0x40260000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 8
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    s_mov_b32 s3, 0x40260000
+; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
 ; CHECK-NEXT:  .LBB0_1: ; %for.cond4.preheader
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], 0
-; CHECK-NEXT:    s_mov_b32 s2, 0
-; CHECK-NEXT:    s_mov_b32 s3, 0x40140000
-; CHECK-NEXT:    v_writelane_b32 v2, s6, 9
-; CHECK-NEXT:    v_writelane_b32 v2, s7, 10
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 11
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, 0x40140000
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
 ; CHECK-NEXT:    v_readlane_b32 s6, v2, 1
 ; CHECK-NEXT:    v_readlane_b32 s7, v2, 2
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
 ; CHECK-NEXT:    s_mov_b32 s1, s7
-; CHECK-NEXT:    s_mov_b32 s0, s2
-; CHECK-NEXT:    v_writelane_b32 v2, s6, 1
-; CHECK-NEXT:    v_writelane_b32 v2, s7, 2
-; CHECK-NEXT:    v_readlane_b32 s6, v2, 9
-; CHECK-NEXT:    v_readlane_b32 s7, v2, 10
-; CHECK-NEXT:    s_mov_b32 s6, s2
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, 0x40140000
+; CHECK-NEXT:    s_mov_b32 s0, s6
+; CHECK-NEXT:    v_readlane_b32 s6, v2, 6
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 3
-; CHECK-NEXT:    v_readlane_b32 s1, v2, 4
-; CHECK-NEXT:    s_mov_b32 s3, s1
 ; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    v_readlane_b32 s7, v2, 7
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    s_mov_b32 s1, s3
+; CHECK-NEXT:    s_mov_b32 s6, s0
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 3
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 4
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 5
-; CHECK-NEXT:    v_readlane_b32 s1, v2, 6
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
-; CHECK-NEXT:    s_mov_b32 s3, s1
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    s_mov_b32 s1, s3
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 6
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 7
-; CHECK-NEXT:    v_readlane_b32 s1, v2, 8
-; CHECK-NEXT:    s_mov_b32 s3, s1
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    s_mov_b32 s1, s3
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 7
-; CHECK-NEXT:    s_mov_b32 s4, s0
-; CHECK-NEXT:    v_writelane_b32 v2, s1, 8
+; CHECK-NEXT:    v_readlane_b32 s6, v2, 8
+; CHECK-NEXT:    v_readlane_b32 s7, v2, 9
+; CHECK-NEXT:    s_mov_b32 s6, s0
+; CHECK-NEXT:    v_readlane_b32 s0, v2, 3
+; CHECK-NEXT:    v_readlane_b32 s1, v2, 4
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; CHECK-NEXT:    s_mov_b32 s6, 0
+; CHECK-NEXT:    s_mov_b32 s7, 0x40140000
+; CHECK-NEXT:    s_mov_b32 s0, s6
+; CHECK-NEXT:    s_mov_b32 s2, s6
+; CHECK-NEXT:    s_mov_b32 s4, s6
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
 ; CHECK-NEXT:    v_readlane_b32 s0, v2, 0
-; CHECK-NEXT:    v_readlane_b32 s2, v2, 11
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; CHECK-NEXT:    v_readlane_b32 s2, v2, 5
 ; CHECK-NEXT:    s_add_i32 s2, s2, s0
-; CHECK-NEXT:    v_writelane_b32 v2, s2, 11
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 11
+; CHECK-NEXT:    v_writelane_b32 v2, s2, 5
+; CHECK-NEXT:    v_readlane_b32 s0, v2, 5
 ; CHECK-NEXT:    s_cmpk_lt_i32 s0, 0xa00
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %for.cond.cleanup.loopexit
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0

diff --git a/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir b/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
index 67f4dd72ea0b2db..d30fb39f874110c 100644
--- a/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
+++ b/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
@@ -153,8 +153,8 @@ body:             |
     %13 = S2_asl_r_p_acc %13, %47, %8.isub_lo
     %51 = A2_tfrpi 0
 
-    ; CHECK: $d2 = S2_extractup undef renamable $d0, 6, 25
-    ; CHECK: $d0 = A2_tfrpi 2
+    ; CHECK: $d0 = S2_extractup undef renamable $d0, 6, 25
+    ; CHECK: $d1 = A2_tfrpi 2
     ; CHECK: $d13 = A2_tfrpi -1
     ; CHECK-NOT: undef $r4
 

diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index 45bb70ec44b737a..a0ac03e9e862f22 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -1024,10 +1024,8 @@ middle.block:                                     ; preds = %vector.body
 define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
 ; CHECK-LABEL: DCT_mve7:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT:    .pad #4
-; CHECK-NEXT:    sub sp, #4
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r11, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #72
@@ -1074,7 +1072,6 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vmov q4, q2
 ; CHECK-NEXT:    vmov q5, q2
 ; CHECK-NEXT:    vmov q3, q2
-; CHECK-NEXT:    vmov q6, q2
 ; CHECK-NEXT:    vmov q1, q2
 ; CHECK-NEXT:    mov r12, r7
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #56] @ 16-byte Spill
@@ -1083,20 +1080,16 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    @ Parent Loop BB6_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vctp.32 r12
-; CHECK-NEXT:    add.w r10, r3, r5
+; CHECK-NEXT:    adds r6, r3, r5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q7, [r1], #16
 ; CHECK-NEXT:    vldrwt.u32 q0, [r3], #16
-; CHECK-NEXT:    add.w r11, r10, r5
+; CHECK-NEXT:    add.w r11, r6, r5
 ; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q5, q0, q7
-; CHECK-NEXT:    vldrwt.u32 q0, [r10]
-; CHECK-NEXT:    add.w r6, r11, r5
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vfmat.f32 q6, q0, q7
 ; CHECK-NEXT:    vldrwt.u32 q0, [r11]
-; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT:    add.w r6, r11, r5
 ; CHECK-NEXT:    vmov q6, q5
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q1, q0, q7
@@ -1178,8 +1171,7 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:  .LBB6_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #72
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    add sp, #4
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r11, pc}
 entry:
   %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
   %i = load i32, ptr %NumInputs, align 4
@@ -1354,7 +1346,6 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    adds r1, r0, #1
 ; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:    vmov q5, q3
-; CHECK-NEXT:    vmov q6, q3
 ; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vmov q2, q3
@@ -1367,46 +1358,43 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vctp.32 r10
 ; CHECK-NEXT:    add.w r11, r3, r6
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vldrwt.u32 q0, [r9], #16
+; CHECK-NEXT:    vpsttt
 ; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
-; CHECK-NEXT:    add.w r5, r11, r6
-; CHECK-NEXT:    sub.w r10, r10, #4
-; CHECK-NEXT:    vpstt
-; CHECK-NEXT:    vfmat.f32 q6, q1, q0
+; CHECK-NEXT:    vldrwt.u32 q0, [r9], #16
 ; CHECK-NEXT:    vldrwt.u32 q1, [r11]
-; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT:    add.w r5, r11, r6
 ; CHECK-NEXT:    vmov q6, q5
+; CHECK-NEXT:    vmov q5, q3
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q7, q1, q0
-; CHECK-NEXT:    vmov q5, q3
 ; CHECK-NEXT:    vmov q3, q4
 ; CHECK-NEXT:    vmov q4, q2
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #56] @ 16-byte Reload
 ; CHECK-NEXT:    adds r7, r5, r6
+; CHECK-NEXT:    adds r5, r7, r6
+; CHECK-NEXT:    sub.w r10, r10, #4
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r7]
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #56] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #72] @ 16-byte Reload
-; CHECK-NEXT:    adds r5, r7, r6
+; CHECK-NEXT:    adds r7, r5, r6
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
-; CHECK-NEXT:    adds r7, r5, r6
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #72] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q2, q4
-; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r7]
+; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    adds r5, r7, r6
-; CHECK-NEXT:    vmov q3, q5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q4, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
+; CHECK-NEXT:    vmov q3, q5
 ; CHECK-NEXT:    vmov q5, q6
 ; CHECK-NEXT:    add r5, r6
 ; CHECK-NEXT:    vpstt

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index d80dd5a673e20f2..d5f472be0432165 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -1077,63 +1077,58 @@ define void @vst3_v16f32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
-; CHECK-NEXT:    vmov.f32 s24, s9
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #144]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
-; CHECK-NEXT:    vmov.f32 s26, s6
 ; CHECK-NEXT:    vldrw.u32 q7, [r0, #112]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
+; CHECK-NEXT:    vmov.f32 s25, s1
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #144]
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT:    vmov.f32 s27, s10
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vmov.f32 s24, s9
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s25, s1
+; CHECK-NEXT:    vmov.f32 s26, s6
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s24, s2
-; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s27, s3
+; CHECK-NEXT:    vmov.f32 s27, s10
+; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s14, s0
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s12, s4
-; CHECK-NEXT:    vmov.f32 s15, s5
+; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
 ; CHECK-NEXT:    vmov.f32 s13, s8
+; CHECK-NEXT:    vmov.f32 s15, s5
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f32 s24, s2
+; CHECK-NEXT:    vmov.f32 s27, s3
+; CHECK-NEXT:    vmov.f32 s2, s12
+; CHECK-NEXT:    vmov.f32 s0, s16
+; CHECK-NEXT:    vmov.f32 s1, s28
+; CHECK-NEXT:    vmov.f32 s3, s17
 ; CHECK-NEXT:    vmov.f32 s25, s7
-; CHECK-NEXT:    vmov.f32 s6, s0
-; CHECK-NEXT:    vmov.f32 s13, s1
-; CHECK-NEXT:    vmov.f32 s0, s2
-; CHECK-NEXT:    vmov.f32 s4, s16
-; CHECK-NEXT:    vmov.f32 s5, s28
-; CHECK-NEXT:    vmov.f32 s7, s17
-; CHECK-NEXT:    vmov.f32 s1, s19
-; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vmov.f32 s2, s31
 ; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s26, s11
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s15, s30
 ; CHECK-NEXT:    vstrw.32 q6, [r1, #32]
 ; CHECK-NEXT:    vmov.f32 s17, s1
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s30, s0
 ; CHECK-NEXT:    vmov.f32 s0, s2
 ; CHECK-NEXT:    vmov.f32 s1, s11
 ; CHECK-NEXT:    vmov.f32 s2, s7
 ; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q0, [sp, #112] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s18, s10
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s28, s8
 ; CHECK-NEXT:    vmov.f32 s31, s9
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s12, s29
 ; CHECK-NEXT:    vmov.f32 s29, s4
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #160]
@@ -1148,14 +1143,14 @@ define void @vst3_v16f32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vmov.f32 s8, s1
 ; CHECK-NEXT:    vmov.f32 s11, s2
 ; CHECK-NEXT:    vmov.f32 s22, s3
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s7, s9
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s9, s21
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s21, s27
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #64]
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #176]

