[llvm] Update amdgpu_gfx functions to use s0-s3 for inreg SGPR arguments on targets using scratch instructions for stack #78226 (PR #81394)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 29 02:11:00 PST 2024
https://github.com/SahilPatidar updated https://github.com/llvm/llvm-project/pull/81394
>From 7b2bcdb69d8301c020d0789279dde7052a26fdef Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil at 2001gmail.com>
Date: Sat, 10 Feb 2024 17:14:14 +0530
Subject: [PATCH 1/2] amdgpu_gfx functions do not use s0-s3 for inreg SGPR
arguments on targets using scratch instructions for stack #78226
---
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 13 +++++++------
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 1 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 14 +++++++-------
3 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 6d05c3678bf09e..4087935cc3ad32 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -715,10 +715,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!IsEntryFunc && !IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
-
- if (!Subtarget.enableFlatScratch())
- CCInfo.AllocateReg(Info->getScratchRSrcReg());
- TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
IncomingValueAssigner Assigner(AssignFn);
@@ -732,9 +728,14 @@ bool AMDGPUCallLowering::lowerFormalArguments(
uint64_t StackSize = Assigner.StackSize;
// Start adding system SGPRs.
- if (IsEntryFunc)
+ if (IsEntryFunc) {
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
-
+ } else {
+ if (!Subtarget.enableFlatScratch())
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
+ if (!IsGraphics)
+ TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
+ }
// When we tail call, we need to check if the callee's arguments will fit on
// the caller's stack. So, whenever we lower formal arguments, we should keep
// track of this information, since we might lower a tail call in this
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index c5207228dc913f..4c922a81c02efd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -23,6 +23,7 @@ def CC_SI_Gfx : CallingConv<[
// 33 is reserved for the frame pointer
// 34 is reserved for the base pointer
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
+ SGPR0, SGPR1, SGPR2, SGPR3,
SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 84ef9679ab9563..7e15f10bf00a80 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2837,12 +2837,6 @@ SDValue SITargetLowering::LowerFormalArguments(
} else if (!IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
-
- // FIXME: Sink this into allocateSpecialInputSGPRs
- if (!Subtarget->enableFlatScratch())
- CCInfo.AllocateReg(Info->getScratchRSrcReg());
-
- allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
if (!IsKernel) {
@@ -3046,8 +3040,14 @@ SDValue SITargetLowering::LowerFormalArguments(
}
// Start adding system SGPRs.
- if (IsEntryFunc)
+ if (IsEntryFunc) {
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
+ } else {
+ if (!Subtarget->enableFlatScratch())
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
+ if (!IsGraphics)
+ allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
+ }
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
>From 8f604002f24d736a4f0cc8bc3b73c0e5b7f03750 Mon Sep 17 00:00:00 2001
From: SahilPatidar <patidarsahil at 2001gmail.com>
Date: Tue, 13 Feb 2024 21:40:44 +0530
Subject: [PATCH 2/2] modified test
---
.../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 761 +++++++++---------
1 file changed, 402 insertions(+), 359 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index e79cb66dcd7760..823c5456b23aff 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -11,14 +11,16 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: s_mov_b32 s40, s6
-; GFX9-O0-NEXT: s_mov_b32 s34, s4
+; GFX9-O0-NEXT: s_mov_b32 s35, s3
+; GFX9-O0-NEXT: s_mov_b32 s40, s2
+; GFX9-O0-NEXT: s_mov_b32 s36, s1
+; GFX9-O0-NEXT: s_mov_b32 s34, s0
; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT: s_mov_b32 s41, s7
+; GFX9-O0-NEXT: s_mov_b32 s41, s35
; GFX9-O0-NEXT: s_mov_b32 s42, s41
; GFX9-O0-NEXT: s_mov_b32 s43, s40
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT: s_mov_b32 s35, s5
+; GFX9-O0-NEXT: s_mov_b32 s35, s36
; GFX9-O0-NEXT: s_mov_b32 s44, s35
; GFX9-O0-NEXT: s_mov_b32 s36, s34
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -76,7 +78,11 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0
+; GFX9-O3-NEXT: s_mov_b32 s36, s0
+; GFX9-O3-NEXT: s_mov_b32 s37, s1
+; GFX9-O3-NEXT: s_mov_b32 s38, s2
+; GFX9-O3-NEXT: s_mov_b32 s39, s3
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v0, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
@@ -102,7 +108,7 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
; GFX9-O3-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v4, 1, v4
; GFX9-O3-NEXT: v_and_b32_e32 v4, 2, v4
-; GFX9-O3-NEXT: buffer_store_dword v4, off, s[4:7], 0 offset:4
+; GFX9-O3-NEXT: buffer_store_dword v4, off, s[36:39], 0 offset:4
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -150,14 +156,16 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47]
-; GFX9-O0-NEXT: s_mov_b32 s40, s6
-; GFX9-O0-NEXT: s_mov_b32 s34, s4
+; GFX9-O0-NEXT: s_mov_b32 s35, s3
+; GFX9-O0-NEXT: s_mov_b32 s40, s2
+; GFX9-O0-NEXT: s_mov_b32 s36, s1
+; GFX9-O0-NEXT: s_mov_b32 s34, s0
; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT: s_mov_b32 s41, s7
+; GFX9-O0-NEXT: s_mov_b32 s41, s35
; GFX9-O0-NEXT: s_mov_b32 s42, s41
; GFX9-O0-NEXT: s_mov_b32 s43, s40
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT: s_mov_b32 s35, s5
+; GFX9-O0-NEXT: s_mov_b32 s35, s36
; GFX9-O0-NEXT: s_mov_b32 s44, s35
; GFX9-O0-NEXT: s_mov_b32 s36, s34
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -266,7 +274,15 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; GFX9-O3-NEXT: s_mov_b32 s40, s0
+; GFX9-O3-NEXT: s_mov_b32 s41, s1
+; GFX9-O3-NEXT: s_mov_b32 s42, s2
+; GFX9-O3-NEXT: s_mov_b32 s43, s3
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[3:4], off, s[40:43], 0
+; GFX9-O3-NEXT: s_mov_b32 s38, s2
+; GFX9-O3-NEXT: s_mov_b32 s39, s3
+; GFX9-O3-NEXT: s_mov_b32 s36, s0
+; GFX9-O3-NEXT: s_mov_b32 s37, s1
; GFX9-O3-NEXT: v_mov_b32_e32 v5, 0
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
@@ -285,17 +301,17 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O3-NEXT: s_and_saveexec_b64 s[34:35], vcc
; GFX9-O3-NEXT: s_cbranch_execz .LBB1_2
; GFX9-O3-NEXT: ; %bb.1: ; %if
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O3-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v4
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[36:37], -1
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[40:41], -1
; GFX9-O3-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O3-NEXT: v_add_u32_e32 v1, v2, v1
-; GFX9-O3-NEXT: s_mov_b64 exec, s[36:37]
+; GFX9-O3-NEXT: s_mov_b64 exec, s[40:41]
; GFX9-O3-NEXT: v_mov_b32_e32 v5, v1
; GFX9-O3-NEXT: .LBB1_2: ; %merge
; GFX9-O3-NEXT: s_or_b64 exec, exec, s[34:35]
@@ -303,7 +319,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
; GFX9-O3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-O3-NEXT: v_and_b32_e32 v0, 2, v0
-; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
+; GFX9-O3-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -376,16 +392,18 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400
-; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0
-; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-O0-NEXT: s_mov_b32 s40, s6
-; GFX9-O0-NEXT: s_mov_b32 s34, s4
+; GFX9-O0-NEXT: v_writelane_b32 v2, s30, 0
+; GFX9-O0-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-O0-NEXT: s_mov_b32 s35, s3
+; GFX9-O0-NEXT: s_mov_b32 s40, s2
+; GFX9-O0-NEXT: s_mov_b32 s36, s1
+; GFX9-O0-NEXT: s_mov_b32 s34, s0
; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41
-; GFX9-O0-NEXT: s_mov_b32 s41, s7
+; GFX9-O0-NEXT: s_mov_b32 s41, s35
; GFX9-O0-NEXT: s_mov_b32 s42, s41
; GFX9-O0-NEXT: s_mov_b32 s43, s40
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT: s_mov_b32 s35, s5
+; GFX9-O0-NEXT: s_mov_b32 s35, s36
; GFX9-O0-NEXT: s_mov_b32 s44, s35
; GFX9-O0-NEXT: s_mov_b32 s36, s34
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
@@ -393,7 +411,7 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O0-NEXT: s_mov_b32 s38, s43
; GFX9-O0-NEXT: s_mov_b32 s39, s42
; GFX9-O0-NEXT: s_mov_b32 s34, 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, s4
; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34
; GFX9-O0-NEXT: s_not_b64 exec, exec
@@ -427,7 +445,7 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O3-LABEL: strict_wwm_call:
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT: s_mov_b32 s38, s33
+; GFX9-O3-NEXT: s_mov_b32 s42, s33
; GFX9-O3-NEXT: s_mov_b32 s33, s32
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -436,31 +454,35 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0
; GFX9-O3-NEXT: s_addk_i32 s32, 0x400
-; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-O3-NEXT: v_writelane_b32 v2, s31, 1
+; GFX9-O3-NEXT: s_mov_b32 s36, s0
+; GFX9-O3-NEXT: s_mov_b32 s37, s1
+; GFX9-O3-NEXT: s_mov_b32 s38, s2
+; GFX9-O3-NEXT: s_mov_b32 s39, s3
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, s4
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-O3-NEXT: s_getpc_b64 s[36:37]
-; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4
-; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called@rel32@hi+12
-; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3
+; GFX9-O3-NEXT: s_getpc_b64 s[40:41]
+; GFX9-O3-NEXT: s_add_u32 s40, s40, strict_wwm_called@rel32@lo+4
+; GFX9-O3-NEXT: s_addc_u32 s41, s41, strict_wwm_called@rel32@hi+12
+; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[40:41]
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0
; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1
-; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0
+; GFX9-O3-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; GFX9-O3-NEXT: v_readlane_b32 s31, v2, 1
+; GFX9-O3-NEXT: v_readlane_b32 s30, v2, 0
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00
-; GFX9-O3-NEXT: s_mov_b32 s33, s38
+; GFX9-O3-NEXT: s_mov_b32 s33, s42
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
%tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
@@ -581,17 +603,19 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000
; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
-; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0
-; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1
-; GFX9-O0-NEXT: s_mov_b32 s34, s8
-; GFX9-O0-NEXT: s_mov_b32 s38, s6
-; GFX9-O0-NEXT: s_mov_b32 s36, s4
+; GFX9-O0-NEXT: v_writelane_b32 v8, s30, 0
+; GFX9-O0-NEXT: v_writelane_b32 v8, s31, 1
+; GFX9-O0-NEXT: s_mov_b32 s34, s4
+; GFX9-O0-NEXT: s_mov_b32 s35, s3
+; GFX9-O0-NEXT: s_mov_b32 s38, s2
+; GFX9-O0-NEXT: s_mov_b32 s40, s1
+; GFX9-O0-NEXT: s_mov_b32 s36, s0
; GFX9-O0-NEXT: ; kill: def $sgpr38 killed $sgpr38 def $sgpr38_sgpr39
-; GFX9-O0-NEXT: s_mov_b32 s39, s7
+; GFX9-O0-NEXT: s_mov_b32 s39, s35
; GFX9-O0-NEXT: s_mov_b32 s35, s39
; GFX9-O0-NEXT: s_mov_b32 s44, s38
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37
-; GFX9-O0-NEXT: s_mov_b32 s37, s5
+; GFX9-O0-NEXT: s_mov_b32 s37, s40
; GFX9-O0-NEXT: s_mov_b32 s45, s37
; GFX9-O0-NEXT: s_mov_b32 s40, s36
; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41_sgpr42_sgpr43
@@ -603,7 +627,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2
; GFX9-O0-NEXT: v_writelane_b32 v0, s43, 3
; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35
-; GFX9-O0-NEXT: s_mov_b32 s35, s9
+; GFX9-O0-NEXT: s_mov_b32 s35, s5
; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35
; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0
; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34
@@ -686,7 +710,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-LABEL: strict_wwm_call_i64:
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-O3-NEXT: s_mov_b32 s40, s33
+; GFX9-O3-NEXT: s_mov_b32 s44, s33
; GFX9-O3-NEXT: s_mov_b32 s33, s32
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -703,32 +727,36 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: s_addk_i32 s32, 0x800
; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1
; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: s_getpc_b64 s[36:37]
-; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called_i64@gotpcrel32@lo+4
-; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called_i64@gotpcrel32@hi+12
-; GFX9-O3-NEXT: s_load_dwordx2 s[36:37], s[36:37], 0x0
+; GFX9-O3-NEXT: s_getpc_b64 s[40:41]
+; GFX9-O3-NEXT: s_add_u32 s40, s40, strict_wwm_called_i64@gotpcrel32@lo+4
+; GFX9-O3-NEXT: s_addc_u32 s41, s41, strict_wwm_called_i64@gotpcrel32@hi+12
+; GFX9-O3-NEXT: s_load_dwordx2 s[40:41], s[40:41], 0x0
+; GFX9-O3-NEXT: s_mov_b32 s36, s0
+; GFX9-O3-NEXT: s_mov_b32 s37, s1
+; GFX9-O3-NEXT: s_mov_b32 s38, s2
+; GFX9-O3-NEXT: s_mov_b32 s39, s3
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, s4
+; GFX9-O3-NEXT: v_mov_b32_e32 v8, s5
; GFX9-O3-NEXT: s_not_b64 exec, exec
; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
; GFX9-O3-NEXT: s_not_b64 exec, exec
-; GFX9-O3-NEXT: s_or_saveexec_b64 s[38:39], -1
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-O3-NEXT: s_or_saveexec_b64 s[42:43], -1
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, v8
; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[40:41]
; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0
; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
-; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
-; GFX9-O3-NEXT: s_mov_b64 exec, s[38:39]
+; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
+; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
+; GFX9-O3-NEXT: s_mov_b64 exec, s[42:43]
; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4
-; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1
-; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0
+; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], 0 offset:4
+; GFX9-O3-NEXT: v_readlane_b32 s31, v6, 1
+; GFX9-O3-NEXT: v_readlane_b32 s30, v6, 0
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
@@ -739,7 +767,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800
-; GFX9-O3-NEXT: s_mov_b32 s33, s40
+; GFX9-O3-NEXT: s_mov_b32 s33, s44
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
%tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
@@ -760,11 +788,14 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: s_mov_b32 s36, s4
+; GFX9-O0-NEXT: s_mov_b32 s34, s3
+; GFX9-O0-NEXT: s_mov_b32 s35, s2
+; GFX9-O0-NEXT: s_mov_b32 s40, s1
+; GFX9-O0-NEXT: s_mov_b32 s36, s0
; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39
-; GFX9-O0-NEXT: s_mov_b32 s37, s5
-; GFX9-O0-NEXT: s_mov_b32 s38, s6
-; GFX9-O0-NEXT: s_mov_b32 s39, s7
+; GFX9-O0-NEXT: s_mov_b32 s37, s40
+; GFX9-O0-NEXT: s_mov_b32 s38, s35
+; GFX9-O0-NEXT: s_mov_b32 s39, s34
; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41_sgpr42_sgpr43 killed $sgpr36_sgpr37_sgpr38_sgpr39
; GFX9-O0-NEXT: s_mov_b32 s34, 5
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0
@@ -848,9 +879,13 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
+; GFX9-O3-NEXT: s_mov_b32 s36, s0
+; GFX9-O3-NEXT: s_mov_b32 s37, s1
+; GFX9-O3-NEXT: s_mov_b32 s38, s2
+; GFX9-O3-NEXT: s_mov_b32 s39, s3
; GFX9-O3-NEXT: v_lshlrev_b32_e32 v0, 5, v0
-; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[4:7], 0 offen
-; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
+; GFX9-O3-NEXT: buffer_load_dwordx4 v[7:10], v0, s[36:39], 0 offen
+; GFX9-O3-NEXT: buffer_load_dwordx2 v[11:12], v0, s[36:39], 0 offen offset:16
; GFX9-O3-NEXT: s_mov_b32 s34, -1
; GFX9-O3-NEXT: s_brev_b32 s35, -2
; GFX9-O3-NEXT: s_waitcnt vmcnt(1)
@@ -879,8 +914,8 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4
; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5
; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6
-; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[4:7], 0 offen
-; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[4:7], 0 offen offset:16
+; GFX9-O3-NEXT: buffer_store_dwordx4 v[7:10], v0, s[36:39], 0 offen
+; GFX9-O3-NEXT: buffer_store_dwordx2 v[11:12], v0, s[36:39], 0 offen offset:16
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -922,270 +957,270 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0: ; %bb.0:
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
-; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_writelane_b32 v47, s64, 0
-; GFX9-O0-NEXT: v_writelane_b32 v47, s65, 1
-; GFX9-O0-NEXT: v_writelane_b32 v47, s66, 2
-; GFX9-O0-NEXT: v_writelane_b32 v47, s67, 3
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_writelane_b32 v32, s64, 0
+; GFX9-O0-NEXT: v_writelane_b32 v32, s65, 1
+; GFX9-O0-NEXT: v_writelane_b32 v32, s66, 2
+; GFX9-O0-NEXT: v_writelane_b32 v32, s67, 3
+; GFX9-O0-NEXT: v_writelane_b32 v32, s68, 4
+; GFX9-O0-NEXT: v_writelane_b32 v32, s69, 5
+; GFX9-O0-NEXT: v_writelane_b32 v32, s70, 6
+; GFX9-O0-NEXT: v_writelane_b32 v32, s71, 7
+; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_mov_b32 s34, s3
+; GFX9-O0-NEXT: s_mov_b32 s35, s2
+; GFX9-O0-NEXT: s_mov_b32 s36, s1
+; GFX9-O0-NEXT: s_mov_b32 s37, s0
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v42, s5
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s37
+; GFX9-O0-NEXT: v_mov_b32_e32 v43, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s35
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v46, s25
-; GFX9-O0-NEXT: v_mov_b32_e32 v45, s26
-; GFX9-O0-NEXT: v_mov_b32_e32 v44, s27
-; GFX9-O0-NEXT: v_mov_b32_e32 v43, s28
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29
-; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v44, s25
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s26
+; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v47, s27
+; GFX9-O0-NEXT: v_mov_b32_e32 v46, s28
+; GFX9-O0-NEXT: v_mov_b32_e32 v45, s29
; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v20, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, v46
-; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, v45
-; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, v44
-; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43
; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_waitcnt vmcnt(4)
-; GFX9-O0-NEXT: v_mov_b32_e32 v25, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v26, v42
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_mov_b32_e32 v27, v46
-; GFX9-O0-NEXT: v_mov_b32_e32 v28, v45
-; GFX9-O0-NEXT: v_mov_b32_e32 v29, v44
-; GFX9-O0-NEXT: v_mov_b32_e32 v30, v43
-; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr42 killed $exec
-; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
+; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v21, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v25, v44
+; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: v_mov_b32_e32 v26, v43
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v27, v47
+; GFX9-O0-NEXT: v_mov_b32_e32 v28, v46
+; GFX9-O0-NEXT: v_mov_b32_e32 v29, v45
+; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-O0-NEXT: v_mov_b32_e32 v30, v44
+; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr43 killed $exec
+; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(4)
; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38
; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: v_mov_b32_e32 v11, v0
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
-; GFX9-O0-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: ; implicit-def: $sgpr38_sgpr39
+; GFX9-O0-NEXT: ; implicit-def: $sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71
+; GFX9-O0-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, v10
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, v11
; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v32, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v33, s35
+; GFX9-O0-NEXT: v_mov_b32_e32 v33, s38
+; GFX9-O0-NEXT: v_mov_b32_e32 v34, s39
; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v34, v8
; GFX9-O0-NEXT: v_mov_b32_e32 v35, v9
; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v34, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v35, s35
+; GFX9-O0-NEXT: v_mov_b32_e32 v35, s38
+; GFX9-O0-NEXT: v_mov_b32_e32 v36, s39
; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v36, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v37, v7
; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v36, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v37, s35
+; GFX9-O0-NEXT: v_mov_b32_e32 v37, s38
+; GFX9-O0-NEXT: v_mov_b32_e32 v38, s39
; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v38, v4
; GFX9-O0-NEXT: v_mov_b32_e32 v39, v5
; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v38, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v39, s35
+; GFX9-O0-NEXT: v_mov_b32_e32 v39, s38
+; GFX9-O0-NEXT: v_mov_b32_e32 v40, s39
; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v40, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v41, v3
; GFX9-O0-NEXT: s_not_b64 exec, exec
-; GFX9-O0-NEXT: v_mov_b32_e32 v40, s34
-; GFX9-O0-NEXT: v_mov_b32_e32 v41, s35
+; GFX9-O0-NEXT: v_mov_b32_e32 v41, s38
+; GFX9-O0-NEXT: v_mov_b32_e32 v42, s39
; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v33
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
@@ -1207,56 +1242,64 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10
-; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11
-; GFX9-O0-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, s13
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, s14
-; GFX9-O0-NEXT: v_mov_b32_e32 v11, s15
-; GFX9-O0-NEXT: v_mov_b32_e32 v12, s16
-; GFX9-O0-NEXT: v_mov_b32_e32 v13, s17
-; GFX9-O0-NEXT: v_mov_b32_e32 v14, s18
-; GFX9-O0-NEXT: v_mov_b32_e32 v15, s19
-; GFX9-O0-NEXT: v_mov_b32_e32 v16, s20
-; GFX9-O0-NEXT: v_mov_b32_e32 v17, s21
-; GFX9-O0-NEXT: v_mov_b32_e32 v18, s22
-; GFX9-O0-NEXT: v_mov_b32_e32 v19, s23
-; GFX9-O0-NEXT: v_mov_b32_e32 v20, s24
-; GFX9-O0-NEXT: v_mov_b32_e32 v21, s25
-; GFX9-O0-NEXT: v_mov_b32_e32 v22, s26
-; GFX9-O0-NEXT: v_mov_b32_e32 v23, s27
-; GFX9-O0-NEXT: v_mov_b32_e32 v24, s28
-; GFX9-O0-NEXT: v_mov_b32_e32 v25, s29
-; GFX9-O0-NEXT: v_readlane_b32 s67, v47, 3
-; GFX9-O0-NEXT: v_readlane_b32 s66, v47, 2
-; GFX9-O0-NEXT: v_readlane_b32 s65, v47, 1
-; GFX9-O0-NEXT: v_readlane_b32 s64, v47, 0
-; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, s37
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, s36
+; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34
+; GFX9-O0-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-O0-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-O0-NEXT: v_mov_b32_e32 v7, s7
+; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8
+; GFX9-O0-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-O0-NEXT: v_mov_b32_e32 v10, s10
+; GFX9-O0-NEXT: v_mov_b32_e32 v11, s11
+; GFX9-O0-NEXT: v_mov_b32_e32 v12, s12
+; GFX9-O0-NEXT: v_mov_b32_e32 v13, s13
+; GFX9-O0-NEXT: v_mov_b32_e32 v14, s14
+; GFX9-O0-NEXT: v_mov_b32_e32 v15, s15
+; GFX9-O0-NEXT: v_mov_b32_e32 v16, s16
+; GFX9-O0-NEXT: v_mov_b32_e32 v17, s17
+; GFX9-O0-NEXT: v_mov_b32_e32 v18, s18
+; GFX9-O0-NEXT: v_mov_b32_e32 v19, s19
+; GFX9-O0-NEXT: v_mov_b32_e32 v20, s20
+; GFX9-O0-NEXT: v_mov_b32_e32 v21, s21
+; GFX9-O0-NEXT: v_mov_b32_e32 v22, s22
+; GFX9-O0-NEXT: v_mov_b32_e32 v23, s23
+; GFX9-O0-NEXT: v_mov_b32_e32 v24, s24
+; GFX9-O0-NEXT: v_mov_b32_e32 v25, s25
+; GFX9-O0-NEXT: v_mov_b32_e32 v26, s26
+; GFX9-O0-NEXT: v_mov_b32_e32 v27, s27
+; GFX9-O0-NEXT: v_mov_b32_e32 v28, s28
+; GFX9-O0-NEXT: v_mov_b32_e32 v29, s29
+; GFX9-O0-NEXT: v_readlane_b32 s71, v32, 7
+; GFX9-O0-NEXT: v_readlane_b32 s70, v32, 6
+; GFX9-O0-NEXT: v_readlane_b32 s69, v32, 5
+; GFX9-O0-NEXT: v_readlane_b32 s68, v32, 4
+; GFX9-O0-NEXT: v_readlane_b32 s67, v32, 3
+; GFX9-O0-NEXT: v_readlane_b32 s66, v32, 2
+; GFX9-O0-NEXT: v_readlane_b32 s65, v32, 1
+; GFX9-O0-NEXT: v_readlane_b32 s64, v32, 0
+; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_mov_b64 exec, -1
-; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
@@ -1265,30 +1308,26 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O3: ; %bb.0:
; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
-; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O3-NEXT: buffer_load_dword v26, off, s[0:3], s32
-; GFX9-O3-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4
-; GFX9-O3-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:8
-; GFX9-O3-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:12
-; GFX9-O3-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:16
-; GFX9-O3-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20
+; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O3-NEXT: buffer_load_dword v30, off, s[0:3], s32
+; GFX9-O3-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4
; GFX9-O3-NEXT: v_mov_b32_e32 v32, v1
; GFX9-O3-NEXT: v_mov_b32_e32 v33, v2
; GFX9-O3-NEXT: s_not_b64 exec, exec
@@ -1329,46 +1368,50 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
; GFX9-O3-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:24
; GFX9-O3-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:36
; GFX9-O3-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:32
-; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_nop 0
-; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-O3-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-O3-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-O3-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-O3-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-O3-NEXT: v_mov_b32_e32 v5, s9
-; GFX9-O3-NEXT: v_mov_b32_e32 v6, s10
-; GFX9-O3-NEXT: v_mov_b32_e32 v7, s11
-; GFX9-O3-NEXT: v_mov_b32_e32 v8, s12
-; GFX9-O3-NEXT: v_mov_b32_e32 v9, s13
-; GFX9-O3-NEXT: v_mov_b32_e32 v10, s14
-; GFX9-O3-NEXT: v_mov_b32_e32 v11, s15
-; GFX9-O3-NEXT: v_mov_b32_e32 v12, s16
-; GFX9-O3-NEXT: v_mov_b32_e32 v13, s17
-; GFX9-O3-NEXT: v_mov_b32_e32 v14, s18
-; GFX9-O3-NEXT: v_mov_b32_e32 v15, s19
-; GFX9-O3-NEXT: v_mov_b32_e32 v16, s20
-; GFX9-O3-NEXT: v_mov_b32_e32 v17, s21
-; GFX9-O3-NEXT: v_mov_b32_e32 v18, s22
-; GFX9-O3-NEXT: v_mov_b32_e32 v19, s23
-; GFX9-O3-NEXT: v_mov_b32_e32 v20, s24
-; GFX9-O3-NEXT: v_mov_b32_e32 v21, s25
-; GFX9-O3-NEXT: v_mov_b32_e32 v22, s26
-; GFX9-O3-NEXT: v_mov_b32_e32 v23, s27
-; GFX9-O3-NEXT: v_mov_b32_e32 v24, s28
-; GFX9-O3-NEXT: v_mov_b32_e32 v25, s29
+; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-O3-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-O3-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-O3-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-O3-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-O3-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-O3-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-O3-NEXT: v_mov_b32_e32 v7, s7
+; GFX9-O3-NEXT: v_mov_b32_e32 v8, s8
+; GFX9-O3-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-O3-NEXT: v_mov_b32_e32 v10, s10
+; GFX9-O3-NEXT: v_mov_b32_e32 v11, s11
+; GFX9-O3-NEXT: v_mov_b32_e32 v12, s12
+; GFX9-O3-NEXT: v_mov_b32_e32 v13, s13
+; GFX9-O3-NEXT: v_mov_b32_e32 v14, s14
+; GFX9-O3-NEXT: v_mov_b32_e32 v15, s15
+; GFX9-O3-NEXT: v_mov_b32_e32 v16, s16
+; GFX9-O3-NEXT: v_mov_b32_e32 v17, s17
+; GFX9-O3-NEXT: v_mov_b32_e32 v18, s18
+; GFX9-O3-NEXT: v_mov_b32_e32 v19, s19
+; GFX9-O3-NEXT: v_mov_b32_e32 v20, s20
+; GFX9-O3-NEXT: v_mov_b32_e32 v21, s21
+; GFX9-O3-NEXT: v_mov_b32_e32 v22, s22
+; GFX9-O3-NEXT: v_mov_b32_e32 v23, s23
+; GFX9-O3-NEXT: v_mov_b32_e32 v24, s24
+; GFX9-O3-NEXT: v_mov_b32_e32 v25, s25
+; GFX9-O3-NEXT: v_mov_b32_e32 v26, s26
+; GFX9-O3-NEXT: v_mov_b32_e32 v27, s27
+; GFX9-O3-NEXT: v_mov_b32_e32 v28, s28
+; GFX9-O3-NEXT: v_mov_b32_e32 v29, s29
; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
-; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O3-NEXT: s_waitcnt vmcnt(0)
; GFX9-O3-NEXT: s_setpc_b64 s[30:31]
More information about the llvm-commits
mailing list