[llvm] e816c89 - Revert "InlineSpiller: Consider if all subranges are the same when avoiding redundant spills"

JP Lehr via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 2 04:34:34 PDT 2023


Author: JP Lehr
Date: 2023-10-02T06:26:33-05:00
New Revision: e816c89c8406670e516c1b20af586358748bf77e

URL: https://github.com/llvm/llvm-project/commit/e816c89c8406670e516c1b20af586358748bf77e
DIFF: https://github.com/llvm/llvm-project/commit/e816c89c8406670e516c1b20af586358748bf77e.diff

LOG: Revert "InlineSpiller: Consider if all subranges are the same when avoiding redundant spills"

This reverts commit d8127b2ba8a87a610851b9a462f2fc2526c36e37.

Added: 
    

Modified: 
    llvm/lib/CodeGen/InlineSpiller.cpp
    llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
    llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
    llvm/test/CodeGen/AMDGPU/swdev380865.ll
    llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
    llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
    llvm/test/CodeGen/Thumb2/mve-vst3.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 1636a225d672a9a..c62f3db9d321562 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -495,31 +495,6 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
   return true;
 }
 
-/// Check if all subranges in \p LI and \p SLI have the same value number at \p
-/// Idx.
-static bool allSubRangeValNoSame(const LiveInterval &LI,
-                                 const LiveInterval &SLI,
-                                 const MachineInstr &MI,
-                                 const MachineRegisterInfo &MRI,
-                                 const TargetRegisterInfo &TRI, SlotIndex Idx) {
-  for (auto &SR : SLI.subranges()) {
-    VNInfo *SubVNI = SR.getVNInfoAt(Idx);
-
-    for (auto &SubLI : LI.subranges()) {
-      if (SubLI.LaneMask == SR.LaneMask) {
-        if (SubVNI != SubLI.getVNInfoAt(Idx))
-          return false;
-      } else if ((SubLI.LaneMask & SR.LaneMask).any()) {
-        // TODO: Check non-exact, overlapping subranges if they share the same
-        // def instruction
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
 /// eliminateRedundantSpills - SLI:VNI is known to be on the stack. Remove any
 /// redundant spills of this value in SLI.reg and sibling copies.
 void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
@@ -549,13 +524,7 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
       if (!MI.mayStore() && !TII.isCopyInstr(MI))
         continue;
       SlotIndex Idx = LIS.getInstructionIndex(MI);
-
-      // The main range value numbers will 
diff er if multiple instructions are
-      // used to define its various subregisters. Check the subregister value
-      // numbers as a fallback.
-      if (LI->getVNInfoAt(Idx) != VNI &&
-          (!SLI.hasSubRanges() ||
-           !allSubRangeValNoSame(*LI, SLI, MI, MRI, TRI, Idx)))
+      if (LI->getVNInfoAt(Idx) != VNI)
         continue;
 
       // Follow sibling copies down the dominator tree.

diff  --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
index 857784282528bc7..7209d160e6c8a7a 100644
--- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
@@ -47,7 +47,7 @@ body:             |
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1:
   ; GCN-NEXT:   successors: %bb.2(0x80000000)
-  ; GCN-NEXT:   liveins: $exec:0x000000000000000F, $sgpr30, $sgpr31, $vgpr0:0x0000000000000003, $vgpr1:0x0000000000000003, $vgpr2:0x0000000000000003, $vgpr3:0x0000000000000003, $vgpr4:0x0000000000000003, $vgpr5:0x0000000000000003, $vgpr6:0x0000000000000003, $vgpr7:0x0000000000000003, $vgpr8:0x0000000000000003, $vgpr9:0x0000000000000003, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F
+  ; GCN-NEXT:   liveins: $exec:0x000000000000000F, $sgpr30, $sgpr31, $vgpr0:0x0000000000000003, $vgpr1:0x0000000000000003, $vgpr2:0x0000000000000003, $vgpr3:0x0000000000000003, $vgpr4:0x0000000000000003, $vgpr5:0x0000000000000003, $vgpr6:0x0000000000000003, $vgpr7:0x0000000000000003, $vgpr8:0x0000000000000003, $vgpr9:0x0000000000000003, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   renamable $vgpr57 = COPY $vgpr9, implicit $exec
   ; GCN-NEXT:   renamable $vgpr56 = COPY $vgpr8, implicit $exec
@@ -62,15 +62,17 @@ body:             |
   ; GCN-NEXT:   renamable $sgpr16_sgpr17 = IMPLICIT_DEF
   ; GCN-NEXT:   $vgpr40 = V_WRITELANE_B32 $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31
   ; GCN-NEXT:   $vgpr40 = V_WRITELANE_B32 $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr10_vgpr11, implicit $vgpr10_vgpr11 :: (store (s32) into %stack.1, addrspace 5)
-  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit killed $vgpr10_vgpr11 :: (store (s32) into %stack.1 + 4, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr14, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15, implicit $vgpr14_vgpr15 :: (store (s32) into %stack.1, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr15, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit killed $vgpr14_vgpr15 :: (store (s32) into %stack.1 + 4, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr10_vgpr11, implicit $vgpr10_vgpr11 :: (store (s32) into %stack.2, addrspace 5)
+  ; GCN-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit killed $vgpr10_vgpr11 :: (store (s32) into %stack.2 + 4, addrspace 5)
   ; GCN-NEXT:   dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu, implicit-def dead $vgpr0
-  ; GCN-NEXT:   $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.2, addrspace 5)
-  ; GCN-NEXT:   $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.2 + 4, addrspace 5)
+  ; GCN-NEXT:   $vgpr14 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1, addrspace 5)
+  ; GCN-NEXT:   $vgpr15 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit-def $vgpr14_vgpr15 :: (load (s32) from %stack.1 + 4, addrspace 5)
   ; GCN-NEXT:   renamable $vgpr0_vgpr1 = nofpexcept V_FMA_F64_e64 0, killed $vgpr45_vgpr46, 0, killed $vgpr41_vgpr42, 0, killed $vgpr60_vgpr61, 0, 0, implicit $mode, implicit $exec
   ; GCN-NEXT:   FLAT_STORE_DWORDX2 killed renamable $vgpr58_vgpr59, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
-  ; GCN-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 52, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1, addrspace 5)
-  ; GCN-NEXT:   $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 56, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.1 + 4, addrspace 5)
+  ; GCN-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 60, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.2, addrspace 5)
+  ; GCN-NEXT:   $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 64, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.2 + 4, addrspace 5)
   ; GCN-NEXT:   FLAT_STORE_DWORDX2 killed renamable $vgpr0_vgpr1, killed renamable $vgpr56_vgpr57, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64))
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2:

diff  --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 9955aca981518df..08db1e7fee259d6 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -10093,7 +10093,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_mov_b32_e32 v8, v6
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240
 ; GFX6-NEXT:    s_addc_u32 s41, s41, 0
-; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x83800
 ; GFX6-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
@@ -10103,7 +10103,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224
-; GFX6-NEXT:    s_mov_b32 s2, 0x83800
+; GFX6-NEXT:    s_mov_b32 s2, 0x83400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10112,7 +10112,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208
-; GFX6-NEXT:    s_mov_b32 s2, 0x83400
+; GFX6-NEXT:    s_mov_b32 s2, 0x83000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10121,7 +10121,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192
-; GFX6-NEXT:    s_mov_b32 s2, 0x83000
+; GFX6-NEXT:    s_mov_b32 s2, 0x82c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10130,7 +10130,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176
-; GFX6-NEXT:    s_mov_b32 s2, 0x82c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x82800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10139,7 +10139,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160
-; GFX6-NEXT:    s_mov_b32 s2, 0x82800
+; GFX6-NEXT:    s_mov_b32 s2, 0x82400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10148,7 +10148,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144
-; GFX6-NEXT:    s_mov_b32 s2, 0x82400
+; GFX6-NEXT:    s_mov_b32 s2, 0x82000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10157,7 +10157,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128
-; GFX6-NEXT:    s_mov_b32 s2, 0x82000
+; GFX6-NEXT:    s_mov_b32 s2, 0x81c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10166,7 +10166,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112
-; GFX6-NEXT:    s_mov_b32 s2, 0x81c00
+; GFX6-NEXT:    s_mov_b32 s2, 0x81800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10175,7 +10175,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96
-; GFX6-NEXT:    s_mov_b32 s2, 0x81800
+; GFX6-NEXT:    s_mov_b32 s2, 0x81400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10184,7 +10184,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80
-; GFX6-NEXT:    s_mov_b32 s2, 0x81400
+; GFX6-NEXT:    s_mov_b32 s2, 0x81000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10218,7 +10218,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[8:9]
 ; GFX6-NEXT:    buffer_load_dwordx4 v[7:10], v[7:8], s[4:7], 0 addr64 offset:48
-; GFX6-NEXT:    s_mov_b32 s2, 0x81000
+; GFX6-NEXT:    s_mov_b32 s2, 0x80c00
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 13, v0
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
@@ -10245,7 +10245,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    v_writelane_b32 v4, s9, 5
 ; GFX6-NEXT:    v_writelane_b32 v4, s10, 6
 ; GFX6-NEXT:    v_writelane_b32 v4, s11, 7
-; GFX6-NEXT:    s_mov_b32 s12, 0x84000
+; GFX6-NEXT:    s_mov_b32 s12, 0x83c00
 ; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s12 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
@@ -10273,183 +10273,197 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_and_saveexec_b64 s[34:35], vcc
 ; GFX6-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX6-NEXT:  ; %bb.1: ; %bb0
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v0, s8, 0
-; GFX6-NEXT:    v_writelane_b32 v0, s9, 1
-; GFX6-NEXT:    v_writelane_b32 v0, s10, 2
-; GFX6-NEXT:    v_writelane_b32 v0, s11, 3
-; GFX6-NEXT:    v_writelane_b32 v0, s12, 4
-; GFX6-NEXT:    v_writelane_b32 v0, s13, 5
-; GFX6-NEXT:    v_writelane_b32 v0, s14, 6
-; GFX6-NEXT:    v_writelane_b32 v0, s15, 7
-; GFX6-NEXT:    s_mov_b32 s36, 0x80800
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s8, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s9, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s10, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s11, 3
+; GFX6-NEXT:    v_writelane_b32 v4, s12, 4
+; GFX6-NEXT:    v_writelane_b32 v4, s13, 5
+; GFX6-NEXT:    v_writelane_b32 v4, s14, 6
+; GFX6-NEXT:    v_writelane_b32 v4, s15, 7
+; GFX6-NEXT:    s_mov_b32 s38, 0x84400
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s36, 0x84000
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s38, 0x83c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s8, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s9, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s10, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s11, v0, 3
-; GFX6-NEXT:    v_readlane_b32 s12, v0, 4
-; GFX6-NEXT:    v_readlane_b32 s13, v0, 5
-; GFX6-NEXT:    v_readlane_b32 s14, v0, 6
-; GFX6-NEXT:    v_readlane_b32 s15, v0, 7
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s8, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s9, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s10, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s11, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s12, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s13, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s14, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s15, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v0, s16, 0
-; GFX6-NEXT:    v_writelane_b32 v0, s17, 1
-; GFX6-NEXT:    v_writelane_b32 v0, s18, 2
-; GFX6-NEXT:    v_writelane_b32 v0, s19, 3
-; GFX6-NEXT:    v_writelane_b32 v0, s20, 4
-; GFX6-NEXT:    v_writelane_b32 v0, s21, 5
-; GFX6-NEXT:    v_writelane_b32 v0, s22, 6
-; GFX6-NEXT:    v_writelane_b32 v0, s23, 7
-; GFX6-NEXT:    s_mov_b32 s36, 0x84800
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s16, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s17, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s18, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s19, 3
+; GFX6-NEXT:    v_writelane_b32 v4, s20, 4
+; GFX6-NEXT:    v_writelane_b32 v4, s21, 5
+; GFX6-NEXT:    v_writelane_b32 v4, s22, 6
+; GFX6-NEXT:    v_writelane_b32 v4, s23, 7
+; GFX6-NEXT:    s_mov_b32 s38, 0x84c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s36, 0x80800
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s38, 0x84400
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s16, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s17, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s18, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s19, v0, 3
-; GFX6-NEXT:    v_readlane_b32 s20, v0, 4
-; GFX6-NEXT:    v_readlane_b32 s21, v0, 5
-; GFX6-NEXT:    v_readlane_b32 s22, v0, 6
-; GFX6-NEXT:    v_readlane_b32 s23, v0, 7
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s16, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s17, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s18, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s19, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s20, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s21, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s22, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s23, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v0, s24, 0
-; GFX6-NEXT:    v_writelane_b32 v0, s25, 1
-; GFX6-NEXT:    v_writelane_b32 v0, s26, 2
-; GFX6-NEXT:    v_writelane_b32 v0, s27, 3
-; GFX6-NEXT:    v_writelane_b32 v0, s28, 4
-; GFX6-NEXT:    v_writelane_b32 v0, s29, 5
-; GFX6-NEXT:    v_writelane_b32 v0, s30, 6
-; GFX6-NEXT:    v_writelane_b32 v0, s31, 7
-; GFX6-NEXT:    s_mov_b32 s36, 0x85000
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s24, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s25, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s26, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s27, 3
+; GFX6-NEXT:    v_writelane_b32 v4, s28, 4
+; GFX6-NEXT:    v_writelane_b32 v4, s29, 5
+; GFX6-NEXT:    v_writelane_b32 v4, s30, 6
+; GFX6-NEXT:    v_writelane_b32 v4, s31, 7
+; GFX6-NEXT:    s_mov_b32 s38, 0x85400
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
-; GFX6-NEXT:    s_mov_b64 s[0:1], exec
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s36, 0x84800
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s38, 0x84c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s36 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s24, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s25, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s26, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s27, v0, 3
-; GFX6-NEXT:    v_readlane_b32 s28, v0, 4
-; GFX6-NEXT:    v_readlane_b32 s29, v0, 5
-; GFX6-NEXT:    v_readlane_b32 s30, v0, 6
-; GFX6-NEXT:    v_readlane_b32 s31, v0, 7
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s24, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s25, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s26, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s27, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s28, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s29, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s30, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s31, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
+; GFX6-NEXT:    s_mov_b64 s[36:37], exec
+; GFX6-NEXT:    s_mov_b64 exec, 15
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    v_writelane_b32 v4, s0, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s1, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s2, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s3, 3
+; GFX6-NEXT:    s_mov_b32 s38, 0x85c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s38 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(0)
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v0, s4, 0
-; GFX6-NEXT:    v_writelane_b32 v0, s5, 1
-; GFX6-NEXT:    v_writelane_b32 v0, s6, 2
-; GFX6-NEXT:    v_writelane_b32 v0, s7, 3
-; GFX6-NEXT:    s_mov_b32 s36, 0x85800
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s36 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s4, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s5, 1
+; GFX6-NEXT:    v_writelane_b32 v4, s6, 2
+; GFX6-NEXT:    v_writelane_b32 v4, s7, 3
+; GFX6-NEXT:    s_mov_b32 s36, 0x86000
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    s_mov_b64 s[0:1], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    v_writelane_b32 v0, s2, 0
-; GFX6-NEXT:    v_writelane_b32 v0, s3, 1
-; GFX6-NEXT:    s_mov_b32 s4, 0x85c00
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s4 ; 4-byte Folded Spill
+; GFX6-NEXT:    v_writelane_b32 v4, s2, 0
+; GFX6-NEXT:    v_writelane_b32 v4, s3, 1
+; GFX6-NEXT:    s_mov_b32 s4, 0x86400
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], s4 ; 4-byte Folded Spill
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[0:1]
 ; GFX6-NEXT:    s_mov_b64 s[36:37], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 0xff
-; GFX6-NEXT:    s_mov_b32 s38, 0x85000
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s38, 0x85400
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s38 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s38 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s0, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s1, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s2, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s3, v0, 3
-; GFX6-NEXT:    v_readlane_b32 s4, v0, 4
-; GFX6-NEXT:    v_readlane_b32 s5, v0, 5
-; GFX6-NEXT:    v_readlane_b32 s6, v0, 6
-; GFX6-NEXT:    v_readlane_b32 s7, v0, 7
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
+; GFX6-NEXT:    v_readlane_b32 s4, v4, 4
+; GFX6-NEXT:    v_readlane_b32 s5, v4, 5
+; GFX6-NEXT:    v_readlane_b32 s6, v4, 6
+; GFX6-NEXT:    v_readlane_b32 s7, v4, 7
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[36:37]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x2160
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2180
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, v1, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s36, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s37, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s38, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s39, v0, 3
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s36, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s37, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s38, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s39, v4, 3
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    s_mov_b64 vcc, s[34:35]
 ; GFX6-NEXT:    s_mov_b64 s[44:45], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 3
-; GFX6-NEXT:    v_mov_b32_e32 v1, 0x2170
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x2190
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, v1, s[40:43], 0 offen ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s34, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s35, v0, 1
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s34, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s35, v4, 1
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[44:45]
 ; GFX6-NEXT:    ;;#ASMSTART
@@ -10458,31 +10472,44 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    s_mov_b64 s[34:35], vcc
 ; GFX6-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX6-NEXT:    s_mov_b64 exec, 15
-; GFX6-NEXT:    s_mov_b32 s6, 0x85e00
-; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    s_mov_b32 s6, 0x85c00
+; GFX6-NEXT:    buffer_store_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], s6 ; 4-byte Folded Reload
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], s6 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_readlane_b32 s0, v0, 0
-; GFX6-NEXT:    v_readlane_b32 s1, v0, 1
-; GFX6-NEXT:    v_readlane_b32 s2, v0, 2
-; GFX6-NEXT:    v_readlane_b32 s3, v0, 3
-; GFX6-NEXT:    buffer_load_dword v0, off, s[40:43], 0
+; GFX6-NEXT:    v_readlane_b32 s0, v4, 0
+; GFX6-NEXT:    v_readlane_b32 s1, v4, 1
+; GFX6-NEXT:    v_readlane_b32 s2, v4, 2
+; GFX6-NEXT:    v_readlane_b32 s3, v4, 3
+; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
+; GFX6-NEXT:    buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_mov_b32 s2, 0x84400
+; GFX6-NEXT:    buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt vmcnt(0)
+; GFX6-NEXT:    buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill
+; GFX6-NEXT:    buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill
+; GFX6-NEXT:    s_waitcnt expcnt(4)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, v17
 ; GFX6-NEXT:    v_mov_b32_e32 v1, v18
 ; GFX6-NEXT:    v_mov_b32_e32 v2, v19
 ; GFX6-NEXT:    v_mov_b32_e32 v3, v20
+; GFX6-NEXT:    s_waitcnt expcnt(0)
 ; GFX6-NEXT:    ;;#ASMSTART
 ; GFX6-NEXT:    ;;#ASMEND
-; GFX6-NEXT:    s_mov_b32 s2, 0x84800
 ; GFX6-NEXT:    v_mov_b32_e32 v20, v3
 ; GFX6-NEXT:    buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s2, 0x84000
+; GFX6-NEXT:    s_mov_b32 s2, 0x83c00
 ; GFX6-NEXT:    v_mov_b32_e32 v19, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v18, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v17, v0
@@ -10518,14 +10545,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v4, off, s[40:43], 0
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX6-NEXT:    s_mov_b32 s4, 0x83c00
+; GFX6-NEXT:    s_mov_b32 s4, 0x83800
 ; GFX6-NEXT:    v_lshl_b64 v[4:5], v[5:6], 8
 ; GFX6-NEXT:    buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; GFX6-NEXT:    s_mov_b32 s4, 0x83800
+; GFX6-NEXT:    s_mov_b32 s4, 0x83400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10533,7 +10560,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x83400
+; GFX6-NEXT:    s_mov_b32 s4, 0x83000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10541,7 +10568,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x83000
+; GFX6-NEXT:    s_mov_b32 s4, 0x82c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10549,7 +10576,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82c00
+; GFX6-NEXT:    s_mov_b32 s4, 0x82800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10557,7 +10584,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82800
+; GFX6-NEXT:    s_mov_b32 s4, 0x82400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10565,7 +10592,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82400
+; GFX6-NEXT:    s_mov_b32 s4, 0x82000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10573,7 +10600,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x82000
+; GFX6-NEXT:    s_mov_b32 s4, 0x81c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10581,7 +10608,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81c00
+; GFX6-NEXT:    s_mov_b32 s4, 0x81800
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10589,7 +10616,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81800
+; GFX6-NEXT:    s_mov_b32 s4, 0x81400
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10597,7 +10624,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81400
+; GFX6-NEXT:    s_mov_b32 s4, 0x81000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96
 ; GFX6-NEXT:    s_waitcnt expcnt(0)
@@ -10605,7 +10632,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX6-NEXT:    buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload
 ; GFX6-NEXT:    buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload
-; GFX6-NEXT:    s_mov_b32 s4, 0x81000
+; GFX6-NEXT:    s_mov_b32 s4, 0x80c00
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80
 ; GFX6-NEXT:    buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:64
@@ -10637,19 +10664,19 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:240
 ; GFX9-FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v7, 1
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[8:11], v0, s[38:39] offset:224
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:208
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[20:23], v0, s[38:39] offset:192
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:176
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[16:19], v0, s[38:39] offset:160
@@ -10658,7 +10685,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:128
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[1:4], s0 ; 16-byte Folded Spill
 ; GFX9-FLATSCR-NEXT:    global_load_dwordx4 v[1:4], v0, s[38:39] offset:112
@@ -10719,24 +10746,26 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    s_and_saveexec_b64 s[34:35], vcc
 ; GFX9-FLATSCR-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX9-FLATSCR-NEXT:  ; %bb.1: ; %bb0
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v0, v16
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39]
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v1, v17
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v2, v18
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v3, v19
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20d0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20e0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[16:19], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20f0
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2100
+; GFX9-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[8:11], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT:    s_nop 0
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20f0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[8:11], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20e0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20f0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[20:23], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v19, v3
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20e0
+; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20d0
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v18, v2
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v17, v1
-; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v16, v0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
@@ -10752,18 +10781,18 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX9-FLATSCR-NEXT:  .LBB1_2: ; %ret
 ; GFX9-FLATSCR-NEXT:    s_or_b64 exec, exec, s[34:35]
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    v_lshlrev_b64 v[4:5], 8, v[5:6]
 ; GFX9-FLATSCR-NEXT:    v_mov_b32_e32 v6, s37
 ; GFX9-FLATSCR-NEXT:    v_add_co_u32_e32 v4, vcc, s36, v4
 ; GFX9-FLATSCR-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20b0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[12:15], off offset:240
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[8:11], off offset:224
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20a0
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
 ; GFX9-FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[6:9], off offset:208
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[20:23], off offset:192
@@ -10773,7 +10802,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[20:23], off offset:176
 ; GFX9-FLATSCR-NEXT:    global_store_dwordx4 v[4:5], v[16:19], off offset:160
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[16:19], off, s0 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2090
+; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x20c0
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[6:9], off, s0 ; 16-byte Folded Reload
 ; GFX9-FLATSCR-NEXT:    s_movk_i32 s0, 0x2060
 ; GFX9-FLATSCR-NEXT:    scratch_load_dwordx4 v[12:15], off, s0 ; 16-byte Folded Reload
@@ -10868,6 +10897,10 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX10-FLATSCR-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX10-FLATSCR-NEXT:  ; %bb.1: ; %bb0
+; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
+; GFX10-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35]
+; GFX10-FLATSCR-NEXT:    ;;#ASMEND
+; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v88, v59
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v92, v63
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v87, v58
@@ -10877,6 +10910,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v90, v61
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v89, v60
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v60, v35
+; GFX10-FLATSCR-NEXT:    scratch_store_dwordx4 off, v[64:67], s0 ; 16-byte Folded Spill
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v68, v39
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v59, v34
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v58, v33
@@ -10925,9 +10959,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v55, v30
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v54, v29
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
-; GFX10-FLATSCR-NEXT:    ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35]
-; GFX10-FLATSCR-NEXT:    ;;#ASMEND
-; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v8, v33
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v28, v53
@@ -10959,7 +10990,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v35, v60
 ; GFX10-FLATSCR-NEXT:    ;;#ASMSTART
 ; GFX10-FLATSCR-NEXT:    ;;#ASMEND
-; GFX10-FLATSCR-NEXT:    s_movk_i32 s0, 0x2010
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v36, v65
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v37, v66
 ; GFX10-FLATSCR-NEXT:    v_mov_b32_e32 v38, v67

diff  --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
index 70687e15947c1d5..7201ffaf561662a 100644
--- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll
+++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll
@@ -16,63 +16,88 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce)
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_mov_b64 s[0:1], 0
 ; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x0
-; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x0
+; CHECK-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    ; kill: killed $sgpr0_sgpr1
+; CHECK-NEXT:    s_mov_b32 s7, 0x401c0000
+; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_writelane_b32 v2, s2, 0
+; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
 ; CHECK-NEXT:    s_mov_b32 s1, 0x40180000
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_writelane_b32 v2, s2, 0
 ; CHECK-NEXT:    v_writelane_b32 v2, s0, 1
 ; CHECK-NEXT:    v_writelane_b32 v2, s1, 2
-; CHECK-NEXT:    s_mov_b32 s1, 0x40240000
+; CHECK-NEXT:    s_mov_b32 s1, 0x40220000
 ; CHECK-NEXT:    v_writelane_b32 v2, s0, 3
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
 ; CHECK-NEXT:    v_writelane_b32 v2, s1, 4
-; CHECK-NEXT:    s_mov_b32 s3, 0x40260000
-; CHECK-NEXT:    s_mov_b32 s5, 0x40280000
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    s_mov_b32 s1, 0x40240000
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
+; CHECK-NEXT:    v_writelane_b32 v2, s1, 6
+; CHECK-NEXT:    s_mov_b32 s1, 0x40260000
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 7
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_writelane_b32 v2, s1, 8
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:  .LBB0_1: ; %for.cond4.preheader
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], 0
-; CHECK-NEXT:    s_mov_b32 s6, 0
-; CHECK-NEXT:    s_mov_b32 s7, 0x40140000
-; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; CHECK-NEXT:    s_mov_b32 s2, 0
+; CHECK-NEXT:    s_mov_b32 s3, 0x40140000
+; CHECK-NEXT:    v_writelane_b32 v2, s6, 9
+; CHECK-NEXT:    v_writelane_b32 v2, s7, 10
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 11
 ; CHECK-NEXT:    v_readlane_b32 s6, v2, 1
 ; CHECK-NEXT:    v_readlane_b32 s7, v2, 2
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
 ; CHECK-NEXT:    s_mov_b32 s1, s7
-; CHECK-NEXT:    s_mov_b32 s6, 0
-; CHECK-NEXT:    s_mov_b32 s7, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s0, s6
-; CHECK-NEXT:    v_readlane_b32 s6, v2, 6
+; CHECK-NEXT:    s_mov_b32 s0, s2
+; CHECK-NEXT:    v_writelane_b32 v2, s6, 1
+; CHECK-NEXT:    v_writelane_b32 v2, s7, 2
+; CHECK-NEXT:    v_readlane_b32 s6, v2, 9
+; CHECK-NEXT:    v_readlane_b32 s7, v2, 10
+; CHECK-NEXT:    s_mov_b32 s6, s2
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    v_readlane_b32 s7, v2, 7
-; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s6, s0
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
-; CHECK-NEXT:    v_readlane_b32 s6, v2, 8
-; CHECK-NEXT:    v_readlane_b32 s7, v2, 9
-; CHECK-NEXT:    s_mov_b32 s6, s0
 ; CHECK-NEXT:    v_readlane_b32 s0, v2, 3
 ; CHECK-NEXT:    v_readlane_b32 s1, v2, 4
+; CHECK-NEXT:    s_mov_b32 s3, s1
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
+; CHECK-NEXT:    s_mov_b32 s2, s0
+; CHECK-NEXT:    s_mov_b32 s1, s3
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
-; CHECK-NEXT:    s_mov_b32 s6, 0
-; CHECK-NEXT:    s_mov_b32 s7, 0x40140000
-; CHECK-NEXT:    s_mov_b32 s0, s6
-; CHECK-NEXT:    s_mov_b32 s2, s6
-; CHECK-NEXT:    s_mov_b32 s4, s6
-; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 0
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 3
+; CHECK-NEXT:    v_writelane_b32 v2, s1, 4
+; CHECK-NEXT:    v_readlane_b32 s0, v2, 5
+; CHECK-NEXT:    v_readlane_b32 s1, v2, 6
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; CHECK-NEXT:    s_mov_b32 s3, s1
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
+; CHECK-NEXT:    s_mov_b32 s2, s0
+; CHECK-NEXT:    s_mov_b32 s1, s3
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 5
+; CHECK-NEXT:    v_writelane_b32 v2, s1, 6
+; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; CHECK-NEXT:    v_readlane_b32 s0, v2, 7
+; CHECK-NEXT:    v_readlane_b32 s1, v2, 8
+; CHECK-NEXT:    s_mov_b32 s3, s1
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0x40140000
+; CHECK-NEXT:    s_mov_b32 s2, s0
+; CHECK-NEXT:    s_mov_b32 s1, s3
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
-; CHECK-NEXT:    v_readlane_b32 s2, v2, 5
+; CHECK-NEXT:    v_writelane_b32 v2, s0, 7
+; CHECK-NEXT:    s_mov_b32 s4, s0
+; CHECK-NEXT:    v_writelane_b32 v2, s1, 8
+; CHECK-NEXT:    v_readlane_b32 s0, v2, 0
+; CHECK-NEXT:    v_readlane_b32 s2, v2, 11
 ; CHECK-NEXT:    s_add_i32 s2, s2, s0
-; CHECK-NEXT:    v_writelane_b32 v2, s2, 5
-; CHECK-NEXT:    v_readlane_b32 s0, v2, 5
-; CHECK-NEXT:    s_cmpk_lt_i32 s0, 0xa00
+; CHECK-NEXT:    v_writelane_b32 v2, s2, 11
 ; CHECK-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
+; CHECK-NEXT:    v_readlane_b32 s0, v2, 11
+; CHECK-NEXT:    s_cmpk_lt_i32 s0, 0xa00
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_1
 ; CHECK-NEXT:  ; %bb.2: ; %for.cond.cleanup.loopexit
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0

diff  --git a/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir b/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
index d30fb39f874110c..67f4dd72ea0b2db 100644
--- a/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
+++ b/llvm/test/CodeGen/Hexagon/regalloc-bad-undef.mir
@@ -153,8 +153,8 @@ body:             |
     %13 = S2_asl_r_p_acc %13, %47, %8.isub_lo
     %51 = A2_tfrpi 0
 
-    ; CHECK: $d0 = S2_extractup undef renamable $d0, 6, 25
-    ; CHECK: $d1 = A2_tfrpi 2
+    ; CHECK: $d2 = S2_extractup undef renamable $d0, 6, 25
+    ; CHECK: $d0 = A2_tfrpi 2
     ; CHECK: $d13 = A2_tfrpi -1
     ; CHECK-NOT: undef $r4
 

diff  --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index a0ac03e9e862f22..45bb70ec44b737a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -1024,8 +1024,10 @@ middle.block:                                     ; preds = %vector.body
 define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
 ; CHECK-LABEL: DCT_mve7:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r11, lr}
-; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r11, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT:    .pad #4
+; CHECK-NEXT:    sub sp, #4
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    .pad #72
@@ -1072,6 +1074,7 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    vmov q4, q2
 ; CHECK-NEXT:    vmov q5, q2
 ; CHECK-NEXT:    vmov q3, q2
+; CHECK-NEXT:    vmov q6, q2
 ; CHECK-NEXT:    vmov q1, q2
 ; CHECK-NEXT:    mov r12, r7
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #56] @ 16-byte Spill
@@ -1080,16 +1083,20 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    @ Parent Loop BB6_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vctp.32 r12
-; CHECK-NEXT:    adds r6, r3, r5
+; CHECK-NEXT:    add.w r10, r3, r5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q7, [r1], #16
 ; CHECK-NEXT:    vldrwt.u32 q0, [r3], #16
-; CHECK-NEXT:    add.w r11, r6, r5
+; CHECK-NEXT:    add.w r11, r10, r5
 ; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q5, q0, q7
-; CHECK-NEXT:    vldrwt.u32 q0, [r11]
+; CHECK-NEXT:    vldrwt.u32 q0, [r10]
 ; CHECK-NEXT:    add.w r6, r11, r5
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vfmat.f32 q6, q0, q7
+; CHECK-NEXT:    vldrwt.u32 q0, [r11]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q6, q5
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q1, q0, q7
@@ -1171,7 +1178,8 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:  .LBB6_5: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #72
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r11, pc}
+; CHECK-NEXT:    add sp, #4
+; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
 entry:
   %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
   %i = load i32, ptr %NumInputs, align 4
@@ -1346,6 +1354,7 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    adds r1, r0, #1
 ; CHECK-NEXT:    mov r3, r12
 ; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    vmov q6, q3
 ; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vmov q2, q3
@@ -1358,43 +1367,46 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    vctp.32 r10
 ; CHECK-NEXT:    add.w r11, r3, r6
-; CHECK-NEXT:    vpsttt
-; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
+; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vldrwt.u32 q0, [r9], #16
-; CHECK-NEXT:    vldrwt.u32 q1, [r11]
+; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
 ; CHECK-NEXT:    add.w r5, r11, r6
+; CHECK-NEXT:    sub.w r10, r10, #4
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vfmat.f32 q6, q1, q0
+; CHECK-NEXT:    vldrwt.u32 q1, [r11]
+; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q6, q5
-; CHECK-NEXT:    vmov q5, q3
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vfmat.f32 q7, q1, q0
+; CHECK-NEXT:    vmov q5, q3
 ; CHECK-NEXT:    vmov q3, q4
 ; CHECK-NEXT:    vmov q4, q2
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #56] @ 16-byte Reload
 ; CHECK-NEXT:    adds r7, r5, r6
-; CHECK-NEXT:    adds r5, r7, r6
-; CHECK-NEXT:    sub.w r10, r10, #4
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r7]
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #56] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #72] @ 16-byte Reload
-; CHECK-NEXT:    adds r7, r5, r6
+; CHECK-NEXT:    adds r5, r7, r6
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
+; CHECK-NEXT:    adds r7, r5, r6
 ; CHECK-NEXT:    vstrw.32 q2, [sp, #72] @ 16-byte Spill
 ; CHECK-NEXT:    vmov q2, q4
+; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q2, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r7]
-; CHECK-NEXT:    vmov q4, q3
 ; CHECK-NEXT:    adds r5, r7, r6
+; CHECK-NEXT:    vmov q3, q5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vfmat.f32 q4, q1, q0
 ; CHECK-NEXT:    vldrwt.u32 q1, [r5]
-; CHECK-NEXT:    vmov q3, q5
 ; CHECK-NEXT:    vmov q5, q6
 ; CHECK-NEXT:    add r5, r6
 ; CHECK-NEXT:    vpstt

diff  --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index d5f472be0432165..d80dd5a673e20f2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -1077,58 +1077,63 @@ define void @vst3_v16f32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #64]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #128]
-; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
-; CHECK-NEXT:    vldrw.u32 q7, [r0, #112]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #112] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
-; CHECK-NEXT:    vmov.f32 s25, s1
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #160]
+; CHECK-NEXT:    vmov.f32 s24, s9
 ; CHECK-NEXT:    vldrw.u32 q5, [r0, #144]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #96]
+; CHECK-NEXT:    vmov.f32 s26, s6
+; CHECK-NEXT:    vldrw.u32 q7, [r0, #112]
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
-; CHECK-NEXT:    vmov.f32 s24, s9
+; CHECK-NEXT:    vmov.f32 s27, s10
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT:    vmov.f32 s26, s6
+; CHECK-NEXT:    vmov.f32 s25, s1
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT:    vmov.f32 s27, s10
-; CHECK-NEXT:    vstrw.32 q3, [sp, #96] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
+; CHECK-NEXT:    vmov.f32 s24, s2
+; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s27, s3
 ; CHECK-NEXT:    vmov.f32 s14, s0
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s12, s4
-; CHECK-NEXT:    vstrw.32 q6, [r1, #16]
-; CHECK-NEXT:    vmov.f32 s13, s8
 ; CHECK-NEXT:    vmov.f32 s15, s5
+; CHECK-NEXT:    vmov.f32 s13, s8
 ; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q3, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s24, s2
-; CHECK-NEXT:    vmov.f32 s27, s3
-; CHECK-NEXT:    vmov.f32 s2, s12
-; CHECK-NEXT:    vmov.f32 s0, s16
-; CHECK-NEXT:    vmov.f32 s1, s28
-; CHECK-NEXT:    vmov.f32 s3, s17
 ; CHECK-NEXT:    vmov.f32 s25, s7
+; CHECK-NEXT:    vmov.f32 s6, s0
+; CHECK-NEXT:    vmov.f32 s13, s1
+; CHECK-NEXT:    vmov.f32 s0, s2
+; CHECK-NEXT:    vmov.f32 s4, s16
+; CHECK-NEXT:    vmov.f32 s5, s28
+; CHECK-NEXT:    vmov.f32 s7, s17
+; CHECK-NEXT:    vmov.f32 s1, s19
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s2, s31
 ; CHECK-NEXT:    vldrw.u32 q1, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s26, s11
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q0, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s15, s30
 ; CHECK-NEXT:    vstrw.32 q6, [r1, #32]
 ; CHECK-NEXT:    vmov.f32 s17, s1
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q6, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s30, s0
 ; CHECK-NEXT:    vmov.f32 s0, s2
 ; CHECK-NEXT:    vmov.f32 s1, s11
 ; CHECK-NEXT:    vmov.f32 s2, s7
 ; CHECK-NEXT:    vmov.f32 s14, s18
-; CHECK-NEXT:    vstrw.32 q0, [sp, #112] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vmov.f32 s18, s10
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s28, s8
 ; CHECK-NEXT:    vmov.f32 s31, s9
-; CHECK-NEXT:    vldrw.u32 q2, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s12, s29
 ; CHECK-NEXT:    vmov.f32 s29, s4
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #160]
@@ -1143,14 +1148,14 @@ define void @vst3_v16f32(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    vmov.f32 s8, s1
 ; CHECK-NEXT:    vmov.f32 s11, s2
 ; CHECK-NEXT:    vmov.f32 s22, s3
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s7, s9
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s9, s21
 ; CHECK-NEXT:    vstrw.32 q1, [r1, #48]
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #144]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #80] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s21, s27
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #64]
 ; CHECK-NEXT:    vstrw.32 q0, [r1, #176]


        


More information about the llvm-commits mailing list