[llvm] 0fd31b2 - [AMDGPU] Place returns on stack if they would violate VGPR limit

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 14 22:07:14 PDT 2023


Author: Carl Ritson
Date: 2023-06-15T14:05:32+09:00
New Revision: 0fd31b2880b32113e3218f14d8e7ae651187a55d

URL: https://github.com/llvm/llvm-project/commit/0fd31b2880b32113e3218f14d8e7ae651187a55d
DIFF: https://github.com/llvm/llvm-project/commit/0fd31b2880b32113e3218f14d8e7ae651187a55d.diff

LOG: [AMDGPU] Place returns on stack if they would violate VGPR limit

Check no VGPRs above configured maximum would be used by a return
when deciding if it can be lowered.

Reviewed By: sebastian-ne

Differential Revision: https://reviews.llvm.org/D152912

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0bf8f66c84205..28a3b87630eea 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2700,7 +2700,17 @@ bool SITargetLowering::CanLowerReturn(
 
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
-  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
+  if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
+    return false;
+
+  // We must use the stack if return would require unavailable registers.
+  unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
+  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
+    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
+      return false;
+
+  return true;
 }
 
 SDValue

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 7d4a5d35fa687..e7a7578b5cea3 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -450,7 +450,6 @@ define amdgpu_gfx <100 x i32> @return_100xi32() #0 {
 ; GFX9-LABEL: return_100xi32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v99, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
@@ -479,41 +478,6 @@ define amdgpu_gfx <100 x i32> @return_100xi32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v25, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v26, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v27, 0
-; GFX9-NEXT:    v_mov_b32_e32 v64, 0
-; GFX9-NEXT:    v_mov_b32_e32 v65, 0
-; GFX9-NEXT:    v_mov_b32_e32 v66, 0
-; GFX9-NEXT:    v_mov_b32_e32 v67, 0
-; GFX9-NEXT:    v_mov_b32_e32 v68, 0
-; GFX9-NEXT:    v_mov_b32_e32 v69, 0
-; GFX9-NEXT:    v_mov_b32_e32 v70, 0
-; GFX9-NEXT:    v_mov_b32_e32 v71, 0
-; GFX9-NEXT:    v_mov_b32_e32 v72, 0
-; GFX9-NEXT:    v_mov_b32_e32 v73, 0
-; GFX9-NEXT:    v_mov_b32_e32 v74, 0
-; GFX9-NEXT:    v_mov_b32_e32 v75, 0
-; GFX9-NEXT:    v_mov_b32_e32 v76, 0
-; GFX9-NEXT:    v_mov_b32_e32 v77, 0
-; GFX9-NEXT:    v_mov_b32_e32 v78, 0
-; GFX9-NEXT:    v_mov_b32_e32 v79, 0
-; GFX9-NEXT:    v_mov_b32_e32 v80, 0
-; GFX9-NEXT:    v_mov_b32_e32 v81, 0
-; GFX9-NEXT:    v_mov_b32_e32 v82, 0
-; GFX9-NEXT:    v_mov_b32_e32 v83, 0
-; GFX9-NEXT:    v_mov_b32_e32 v84, 0
-; GFX9-NEXT:    v_mov_b32_e32 v85, 0
-; GFX9-NEXT:    v_mov_b32_e32 v86, 0
-; GFX9-NEXT:    v_mov_b32_e32 v87, 0
-; GFX9-NEXT:    v_mov_b32_e32 v88, 0
-; GFX9-NEXT:    v_mov_b32_e32 v89, 0
-; GFX9-NEXT:    v_mov_b32_e32 v90, 0
-; GFX9-NEXT:    v_mov_b32_e32 v91, 0
-; GFX9-NEXT:    v_mov_b32_e32 v92, 0
-; GFX9-NEXT:    v_mov_b32_e32 v93, 0
-; GFX9-NEXT:    v_mov_b32_e32 v94, 0
-; GFX9-NEXT:    v_mov_b32_e32 v95, 0
-; GFX9-NEXT:    v_mov_b32_e32 v96, 0
-; GFX9-NEXT:    v_mov_b32_e32 v97, 0
-; GFX9-NEXT:    v_mov_b32_e32 v98, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v29, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v30, 0
@@ -550,6 +514,42 @@ define amdgpu_gfx <100 x i32> @return_100xi32() #0 {
 ; GFX9-NEXT:    v_mov_b32_e32 v61, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v62, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v63, 0
+; GFX9-NEXT:    v_mov_b32_e32 v64, 0
+; GFX9-NEXT:    v_mov_b32_e32 v65, 0
+; GFX9-NEXT:    v_mov_b32_e32 v66, 0
+; GFX9-NEXT:    v_mov_b32_e32 v67, 0
+; GFX9-NEXT:    v_mov_b32_e32 v68, 0
+; GFX9-NEXT:    v_mov_b32_e32 v69, 0
+; GFX9-NEXT:    v_mov_b32_e32 v70, 0
+; GFX9-NEXT:    v_mov_b32_e32 v71, 0
+; GFX9-NEXT:    v_mov_b32_e32 v72, 0
+; GFX9-NEXT:    v_mov_b32_e32 v73, 0
+; GFX9-NEXT:    v_mov_b32_e32 v74, 0
+; GFX9-NEXT:    v_mov_b32_e32 v75, 0
+; GFX9-NEXT:    v_mov_b32_e32 v76, 0
+; GFX9-NEXT:    v_mov_b32_e32 v77, 0
+; GFX9-NEXT:    v_mov_b32_e32 v78, 0
+; GFX9-NEXT:    v_mov_b32_e32 v79, 0
+; GFX9-NEXT:    v_mov_b32_e32 v80, 0
+; GFX9-NEXT:    v_mov_b32_e32 v81, 0
+; GFX9-NEXT:    v_mov_b32_e32 v82, 0
+; GFX9-NEXT:    v_mov_b32_e32 v83, 0
+; GFX9-NEXT:    v_mov_b32_e32 v84, 0
+; GFX9-NEXT:    v_mov_b32_e32 v85, 0
+; GFX9-NEXT:    v_mov_b32_e32 v86, 0
+; GFX9-NEXT:    v_mov_b32_e32 v87, 0
+; GFX9-NEXT:    v_mov_b32_e32 v88, 0
+; GFX9-NEXT:    v_mov_b32_e32 v89, 0
+; GFX9-NEXT:    v_mov_b32_e32 v90, 0
+; GFX9-NEXT:    v_mov_b32_e32 v91, 0
+; GFX9-NEXT:    v_mov_b32_e32 v92, 0
+; GFX9-NEXT:    v_mov_b32_e32 v93, 0
+; GFX9-NEXT:    v_mov_b32_e32 v94, 0
+; GFX9-NEXT:    v_mov_b32_e32 v95, 0
+; GFX9-NEXT:    v_mov_b32_e32 v96, 0
+; GFX9-NEXT:    v_mov_b32_e32 v97, 0
+; GFX9-NEXT:    v_mov_b32_e32 v98, 0
+; GFX9-NEXT:    v_mov_b32_e32 v99, 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: return_100xi32:
@@ -722,7 +722,15 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s36, s33
 ; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
+; GFX9-NEXT:    buffer_store_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_addk_i32 s32, 0x2400
+; GFX9-NEXT:    s_getpc_b64 s[34:35]
+; GFX9-NEXT:    s_add_u32 s34, s34, return_100xi32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s35, s35, return_100xi32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v100, s30, 0
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
@@ -755,46 +763,9 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX9-NEXT:    buffer_store_dword v93, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v94, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v95, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT:    s_mov_b64 s[34:35], exec
-; GFX9-NEXT:    s_mov_b64 exec, 1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:136
-; GFX9-NEXT:    v_writelane_b32 v0, s30, 0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:136
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[34:35], exec
-; GFX9-NEXT:    s_mov_b64 exec, 1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:136
-; GFX9-NEXT:    v_writelane_b32 v0, s31, 0
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:136
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_getpc_b64 s[34:35]
-; GFX9-NEXT:    s_add_u32 s34, s34, return_100xi32@gotpcrel32@lo+4
-; GFX9-NEXT:    s_addc_u32 s35, s35, return_100xi32@gotpcrel32@hi+12
-; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT:    v_writelane_b32 v100, s31, 1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[34:35], exec
-; GFX9-NEXT:    s_mov_b64 exec, 1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:136
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s31, v0, 0
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:136
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
-; GFX9-NEXT:    s_mov_b64 s[34:35], exec
-; GFX9-NEXT:    s_mov_b64 exec, 1
-; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:136
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_readlane_b32 s30, v0, 0
-; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:136
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    buffer_load_dword v95, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v94, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v93, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -827,6 +798,11 @@ define amdgpu_gfx void @call_100xi32() #0 {
 ; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
 ; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v100, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v100, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
+; GFX9-NEXT:    buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
 ; GFX9-NEXT:    s_addk_i32 s32, 0xdc00
 ; GFX9-NEXT:    s_mov_b32 s33, s36
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -2439,4 +2415,1226 @@ entry:
   ret void
 }
 
-attributes #0 = { nounwind }
+; Check that return values larger than VGPR limit are handled correctly
+
+define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
+; GFX9-LABEL: return_72xi32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:160
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:284
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:280
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:152
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:276
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:148
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:272
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:144
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:268
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:264
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:136
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:260
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:256
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:252
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:248
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:244
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:240
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:236
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:232
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:228
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:224
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:220
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:216
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:212
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:208
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:204
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:200
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:196
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:192
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:188
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:184
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:180
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:176
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:172
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:168
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:164
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:160
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:156
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:152
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:148
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:144
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:140
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:136
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:132
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: return_72xi32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_clause 0x14
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:132
+; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:136
+; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:140
+; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:144
+; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:148
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:152
+; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:156
+; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:160
+; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:108
+; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:120
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:116
+; GFX10-NEXT:    buffer_load_dword v30, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:112
+; GFX10-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:108
+; GFX10-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:104
+; GFX10-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:32
+; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:100
+; GFX10-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:96
+; GFX10-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:92
+; GFX10-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:44
+; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:88
+; GFX10-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:84
+; GFX10-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:80
+; GFX10-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:76
+; GFX10-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:72
+; GFX10-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:68
+; GFX10-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:64
+; GFX10-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:60
+; GFX10-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:56
+; GFX10-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:52
+; GFX10-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:48
+; GFX10-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:44
+; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:40
+; GFX10-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:36
+; GFX10-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:32
+; GFX10-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:28
+; GFX10-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; GFX10-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; GFX10-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; GFX10-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; GFX10-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; GFX10-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX10-NEXT:    s_waitcnt vmcnt(32)
+; GFX10-NEXT:    buffer_store_dword v48, v0, s[0:3], 0 offen offset:284
+; GFX10-NEXT:    buffer_store_dword v39, v0, s[0:3], 0 offen offset:280
+; GFX10-NEXT:    buffer_store_dword v38, v0, s[0:3], 0 offen offset:276
+; GFX10-NEXT:    buffer_store_dword v37, v0, s[0:3], 0 offen offset:272
+; GFX10-NEXT:    buffer_store_dword v36, v0, s[0:3], 0 offen offset:268
+; GFX10-NEXT:    buffer_store_dword v35, v0, s[0:3], 0 offen offset:264
+; GFX10-NEXT:    buffer_store_dword v34, v0, s[0:3], 0 offen offset:260
+; GFX10-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:256
+; GFX10-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:252
+; GFX10-NEXT:    s_waitcnt vmcnt(24)
+; GFX10-NEXT:    buffer_store_dword v40, v0, s[0:3], 0 offen offset:248
+; GFX10-NEXT:    buffer_store_dword v55, v0, s[0:3], 0 offen offset:244
+; GFX10-NEXT:    buffer_store_dword v54, v0, s[0:3], 0 offen offset:240
+; GFX10-NEXT:    buffer_store_dword v53, v0, s[0:3], 0 offen offset:236
+; GFX10-NEXT:    buffer_store_dword v52, v0, s[0:3], 0 offen offset:232
+; GFX10-NEXT:    buffer_store_dword v51, v0, s[0:3], 0 offen offset:228
+; GFX10-NEXT:    buffer_store_dword v50, v0, s[0:3], 0 offen offset:224
+; GFX10-NEXT:    buffer_store_dword v49, v0, s[0:3], 0 offen offset:220
+; GFX10-NEXT:    s_waitcnt vmcnt(16)
+; GFX10-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:216
+; GFX10-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:212
+; GFX10-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:208
+; GFX10-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:204
+; GFX10-NEXT:    buffer_store_dword v44, v0, s[0:3], 0 offen offset:200
+; GFX10-NEXT:    buffer_store_dword v43, v0, s[0:3], 0 offen offset:196
+; GFX10-NEXT:    buffer_store_dword v42, v0, s[0:3], 0 offen offset:192
+; GFX10-NEXT:    buffer_store_dword v41, v0, s[0:3], 0 offen offset:188
+; GFX10-NEXT:    s_waitcnt vmcnt(8)
+; GFX10-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:184
+; GFX10-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:180
+; GFX10-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:176
+; GFX10-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:172
+; GFX10-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:168
+; GFX10-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:164
+; GFX10-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:160
+; GFX10-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:156
+; GFX10-NEXT:    s_waitcnt vmcnt(7)
+; GFX10-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:152
+; GFX10-NEXT:    s_waitcnt vmcnt(3)
+; GFX10-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:148
+; GFX10-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:144
+; GFX10-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:140
+; GFX10-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:136
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:132
+; GFX10-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:128
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:124
+; GFX10-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX10-NEXT:    s_clause 0x4
+; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s32 offset:164
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s32 offset:168
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s32 offset:172
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s32 offset:176
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:180
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: return_72xi32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0xe
+; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:220
+; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:216
+; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:212
+; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:208
+; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:204
+; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:200
+; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:196
+; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:192
+; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:188
+; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:184
+; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:180
+; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:176
+; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:172
+; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:168
+; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:164
+; GFX11-NEXT:    s_clause 0x12
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:152
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:148
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:136
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:132
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:60
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:104
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x110
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:88
+; GFX11-NEXT:    s_add_i32 s2, s0, 0xe0
+; GFX11-NEXT:    s_add_i32 s3, s0, 0xd0
+; GFX11-NEXT:    s_add_i32 s34, s0, 0xc0
+; GFX11-NEXT:    s_add_i32 s35, s0, 0xb0
+; GFX11-NEXT:    s_add_i32 s36, s0, 0xa0
+; GFX11-NEXT:    s_add_i32 s37, s0, 0x90
+; GFX11-NEXT:    s_add_i32 s38, s0, 0x80
+; GFX11-NEXT:    s_add_i32 s39, s0, 0x70
+; GFX11-NEXT:    s_add_i32 s40, s0, 0x60
+; GFX11-NEXT:    s_add_i32 s41, s0, 0x50
+; GFX11-NEXT:    s_add_i32 s42, s0, 64
+; GFX11-NEXT:    s_add_i32 s43, s0, 48
+; GFX11-NEXT:    s_add_i32 s44, s0, 32
+; GFX11-NEXT:    s_waitcnt vmcnt(23)
+; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s1
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x100
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:108
+; GFX11-NEXT:    s_waitcnt vmcnt(21)
+; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s1
+; GFX11-NEXT:    s_clause 0xb
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
+; GFX11-NEXT:    s_add_i32 s1, s0, 0xf0
+; GFX11-NEXT:    s_add_i32 s0, s0, 16
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    scratch_store_b128 off, v[33:36], s1
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    scratch_store_b128 off, v[60:63], s2
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s3
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    scratch_store_b128 off, v[56:59], s34
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    scratch_store_b128 off, v[43:46], s35
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    scratch_store_b128 off, v[39:42], s36
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s37
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s38
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s39
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s40
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s41
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s42
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s43
+; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s44
+; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s0
+; GFX11-NEXT:    s_clause 0xe
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:164
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:168
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:172
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:176
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:180
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:184
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:188
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:192
+; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:196
+; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:200
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:204
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:208
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:212
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:216
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:220
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  ret <72 x i32> %val
+}
+
+define amdgpu_gfx void @call_72xi32() #1 {
+; GFX9-LABEL: call_72xi32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s36, s33
+; GFX9-NEXT:    s_add_i32 s33, s32, 0x7fc0
+; GFX9-NEXT:    s_and_b32 s33, s33, 0xffff8000
+; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    s_add_i32 s32, s32, 0x28000
+; GFX9-NEXT:    s_getpc_b64 s[34:35]
+; GFX9-NEXT:    s_add_u32 s34, s34, return_72xi32@gotpcrel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s35, s35, return_72xi32@gotpcrel32@hi+12
+; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:88
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:96
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:112
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:136
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:144
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:148
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:152
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:160
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX9-NEXT:    v_writelane_b32 v33, s30, 0
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x200, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v12, 0
+; GFX9-NEXT:    v_mov_b32_e32 v13, 0
+; GFX9-NEXT:    v_mov_b32_e32 v14, 0
+; GFX9-NEXT:    v_mov_b32_e32 v15, 0
+; GFX9-NEXT:    v_mov_b32_e32 v16, 0
+; GFX9-NEXT:    v_mov_b32_e32 v17, 0
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0
+; GFX9-NEXT:    v_mov_b32_e32 v19, 0
+; GFX9-NEXT:    v_mov_b32_e32 v20, 0
+; GFX9-NEXT:    v_mov_b32_e32 v21, 0
+; GFX9-NEXT:    v_mov_b32_e32 v22, 0
+; GFX9-NEXT:    v_mov_b32_e32 v23, 0
+; GFX9-NEXT:    v_mov_b32_e32 v24, 0
+; GFX9-NEXT:    v_mov_b32_e32 v25, 0
+; GFX9-NEXT:    v_mov_b32_e32 v26, 0
+; GFX9-NEXT:    v_mov_b32_e32 v27, 0
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0
+; GFX9-NEXT:    v_mov_b32_e32 v29, 0
+; GFX9-NEXT:    v_mov_b32_e32 v30, 0
+; GFX9-NEXT:    v_mov_b32_e32 v31, 0
+; GFX9-NEXT:    v_writelane_b32 v33, s31, 1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:636
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:640
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:644
+; GFX9-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:648
+; GFX9-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:652
+; GFX9-NEXT:    buffer_load_dword v36, off, s[0:3], s33 offset:656
+; GFX9-NEXT:    buffer_load_dword v37, off, s[0:3], s33 offset:660
+; GFX9-NEXT:    buffer_load_dword v38, off, s[0:3], s33 offset:664
+; GFX9-NEXT:    buffer_load_dword v39, off, s[0:3], s33 offset:668
+; GFX9-NEXT:    buffer_load_dword v48, off, s[0:3], s33 offset:672
+; GFX9-NEXT:    buffer_load_dword v49, off, s[0:3], s33 offset:676
+; GFX9-NEXT:    buffer_load_dword v50, off, s[0:3], s33 offset:680
+; GFX9-NEXT:    buffer_load_dword v51, off, s[0:3], s33 offset:684
+; GFX9-NEXT:    buffer_load_dword v52, off, s[0:3], s33 offset:688
+; GFX9-NEXT:    buffer_load_dword v53, off, s[0:3], s33 offset:692
+; GFX9-NEXT:    buffer_load_dword v54, off, s[0:3], s33 offset:696
+; GFX9-NEXT:    buffer_load_dword v55, off, s[0:3], s33 offset:700
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:704
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:708
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:712
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:716
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:720
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:724
+; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:728
+; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:732
+; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:736
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:740
+; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:748
+; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:752
+; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:756
+; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:760
+; GFX9-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:764
+; GFX9-NEXT:    buffer_load_dword v63, off, s[0:3], s33 offset:768
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:772
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:776
+; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:780
+; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:784
+; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:788
+; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:792
+; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:796
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:516
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:520
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:524
+; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:528
+; GFX9-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:532
+; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:536
+; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:540
+; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:544
+; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:548
+; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:552
+; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:556
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:560
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:564
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:568
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:572
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:576
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:580
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:584
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:588
+; GFX9-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:592
+; GFX9-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:596
+; GFX9-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:600
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:604
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:608
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:612
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:616
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:620
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:624
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:628
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:632
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:88
+; GFX9-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:96
+; GFX9-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:104
+; GFX9-NEXT:    v_mov_b32_e32 v0, 24
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:112
+; GFX9-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:120
+; GFX9-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:132
+; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:136
+; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:140
+; GFX9-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:144
+; GFX9-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:148
+; GFX9-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:152
+; GFX9-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:156
+; GFX9-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:160
+; GFX9-NEXT:    v_mov_b32_e32 v2, v24
+; GFX9-NEXT:    v_mov_b32_e32 v3, v25
+; GFX9-NEXT:    v_mov_b32_e32 v4, v26
+; GFX9-NEXT:    v_mov_b32_e32 v5, v27
+; GFX9-NEXT:    v_mov_b32_e32 v6, v28
+; GFX9-NEXT:    v_mov_b32_e32 v7, v29
+; GFX9-NEXT:    v_mov_b32_e32 v8, v30
+; GFX9-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x400, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 42
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT:    buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
+; GFX9-NEXT:    v_readlane_b32 s31, v33, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v33, 0
+; GFX9-NEXT:    s_xor_saveexec_b64 s[34:35], -1
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[34:35]
+; GFX9-NEXT:    s_add_i32 s32, s32, 0xfffd8000
+; GFX9-NEXT:    s_mov_b32 s33, s36
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: call_72xi32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    s_mov_b32 s36, s33
+; GFX10-NEXT:    s_add_i32 s33, s32, 0x3fe0
+; GFX10-NEXT:    s_and_b32 s33, s33, 0xffffc000
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
+; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    s_add_i32 s32, s32, 0x14000
+; GFX10-NEXT:    s_getpc_b64 s[34:35]
+; GFX10-NEXT:    s_add_u32 s34, s34, return_72xi32@gotpcrel32@lo+4
+; GFX10-NEXT:    s_addc_u32 s35, s35, return_72xi32@gotpcrel32@hi+12
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:44
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:128
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:132
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:136
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:140
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:144
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:148
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:152
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:156
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:160
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
+; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x200, v0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, 0
+; GFX10-NEXT:    v_mov_b32_e32 v7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-NEXT:    v_mov_b32_e32 v9, 0
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_mov_b32_e32 v12, 0
+; GFX10-NEXT:    v_mov_b32_e32 v13, 0
+; GFX10-NEXT:    v_mov_b32_e32 v14, 0
+; GFX10-NEXT:    v_mov_b32_e32 v15, 0
+; GFX10-NEXT:    v_mov_b32_e32 v16, 0
+; GFX10-NEXT:    v_mov_b32_e32 v17, 0
+; GFX10-NEXT:    v_mov_b32_e32 v18, 0
+; GFX10-NEXT:    v_mov_b32_e32 v19, 0
+; GFX10-NEXT:    v_mov_b32_e32 v20, 0
+; GFX10-NEXT:    v_mov_b32_e32 v21, 0
+; GFX10-NEXT:    v_mov_b32_e32 v22, 0
+; GFX10-NEXT:    v_mov_b32_e32 v23, 0
+; GFX10-NEXT:    v_mov_b32_e32 v24, 0
+; GFX10-NEXT:    v_mov_b32_e32 v25, 0
+; GFX10-NEXT:    v_mov_b32_e32 v26, 0
+; GFX10-NEXT:    v_mov_b32_e32 v27, 0
+; GFX10-NEXT:    v_mov_b32_e32 v28, 0
+; GFX10-NEXT:    v_mov_b32_e32 v29, 0
+; GFX10-NEXT:    v_mov_b32_e32 v30, 0
+; GFX10-NEXT:    v_mov_b32_e32 v31, 0
+; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    s_clause 0x28
+; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:636
+; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:640
+; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:644
+; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:648
+; GFX10-NEXT:    buffer_load_dword v34, off, s[0:3], s33 offset:652
+; GFX10-NEXT:    buffer_load_dword v35, off, s[0:3], s33 offset:656
+; GFX10-NEXT:    buffer_load_dword v36, off, s[0:3], s33 offset:660
+; GFX10-NEXT:    buffer_load_dword v37, off, s[0:3], s33 offset:664
+; GFX10-NEXT:    buffer_load_dword v38, off, s[0:3], s33 offset:668
+; GFX10-NEXT:    buffer_load_dword v39, off, s[0:3], s33 offset:672
+; GFX10-NEXT:    buffer_load_dword v48, off, s[0:3], s33 offset:676
+; GFX10-NEXT:    buffer_load_dword v49, off, s[0:3], s33 offset:680
+; GFX10-NEXT:    buffer_load_dword v50, off, s[0:3], s33 offset:684
+; GFX10-NEXT:    buffer_load_dword v51, off, s[0:3], s33 offset:688
+; GFX10-NEXT:    buffer_load_dword v52, off, s[0:3], s33 offset:692
+; GFX10-NEXT:    buffer_load_dword v53, off, s[0:3], s33 offset:696
+; GFX10-NEXT:    buffer_load_dword v54, off, s[0:3], s33 offset:700
+; GFX10-NEXT:    buffer_load_dword v55, off, s[0:3], s33 offset:704
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:708
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:712
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:716
+; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:720
+; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:724
+; GFX10-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:728
+; GFX10-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:732
+; GFX10-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:736
+; GFX10-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:740
+; GFX10-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:748
+; GFX10-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:752
+; GFX10-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:756
+; GFX10-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:760
+; GFX10-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:764
+; GFX10-NEXT:    buffer_load_dword v63, off, s[0:3], s33 offset:768
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:772
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:776
+; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:780
+; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:784
+; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:788
+; GFX10-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:792
+; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:796
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:516
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:520
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:524
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:528
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:532
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:536
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:540
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill
+; GFX10-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:544
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill
+; GFX10-NEXT:    s_clause 0x15
+; GFX10-NEXT:    buffer_load_dword v10, off, s[0:3], s33 offset:548
+; GFX10-NEXT:    buffer_load_dword v11, off, s[0:3], s33 offset:552
+; GFX10-NEXT:    buffer_load_dword v12, off, s[0:3], s33 offset:556
+; GFX10-NEXT:    buffer_load_dword v13, off, s[0:3], s33 offset:560
+; GFX10-NEXT:    buffer_load_dword v14, off, s[0:3], s33 offset:564
+; GFX10-NEXT:    buffer_load_dword v15, off, s[0:3], s33 offset:568
+; GFX10-NEXT:    buffer_load_dword v16, off, s[0:3], s33 offset:572
+; GFX10-NEXT:    buffer_load_dword v17, off, s[0:3], s33 offset:576
+; GFX10-NEXT:    buffer_load_dword v18, off, s[0:3], s33 offset:580
+; GFX10-NEXT:    buffer_load_dword v19, off, s[0:3], s33 offset:584
+; GFX10-NEXT:    buffer_load_dword v20, off, s[0:3], s33 offset:588
+; GFX10-NEXT:    buffer_load_dword v21, off, s[0:3], s33 offset:592
+; GFX10-NEXT:    buffer_load_dword v22, off, s[0:3], s33 offset:596
+; GFX10-NEXT:    buffer_load_dword v23, off, s[0:3], s33 offset:600
+; GFX10-NEXT:    buffer_load_dword v24, off, s[0:3], s33 offset:604
+; GFX10-NEXT:    buffer_load_dword v25, off, s[0:3], s33 offset:608
+; GFX10-NEXT:    buffer_load_dword v26, off, s[0:3], s33 offset:612
+; GFX10-NEXT:    buffer_load_dword v27, off, s[0:3], s33 offset:616
+; GFX10-NEXT:    buffer_load_dword v28, off, s[0:3], s33 offset:620
+; GFX10-NEXT:    buffer_load_dword v29, off, s[0:3], s33 offset:624
+; GFX10-NEXT:    buffer_load_dword v30, off, s[0:3], s33 offset:628
+; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:632
+; GFX10-NEXT:    v_mov_b32_e32 v0, 24
+; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:108
+; GFX10-NEXT:    buffer_store_dword v9, off, s[0:3], s32
+; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:8
+; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:12
+; GFX10-NEXT:    buffer_store_dword v34, off, s[0:3], s32 offset:16
+; GFX10-NEXT:    buffer_store_dword v35, off, s[0:3], s32 offset:20
+; GFX10-NEXT:    buffer_store_dword v36, off, s[0:3], s32 offset:24
+; GFX10-NEXT:    buffer_store_dword v37, off, s[0:3], s32 offset:28
+; GFX10-NEXT:    buffer_store_dword v38, off, s[0:3], s32 offset:32
+; GFX10-NEXT:    buffer_store_dword v39, off, s[0:3], s32 offset:36
+; GFX10-NEXT:    buffer_store_dword v48, off, s[0:3], s32 offset:40
+; GFX10-NEXT:    buffer_store_dword v49, off, s[0:3], s32 offset:44
+; GFX10-NEXT:    buffer_store_dword v50, off, s[0:3], s32 offset:48
+; GFX10-NEXT:    buffer_store_dword v51, off, s[0:3], s32 offset:52
+; GFX10-NEXT:    buffer_store_dword v52, off, s[0:3], s32 offset:56
+; GFX10-NEXT:    buffer_store_dword v53, off, s[0:3], s32 offset:60
+; GFX10-NEXT:    buffer_store_dword v54, off, s[0:3], s32 offset:64
+; GFX10-NEXT:    buffer_store_dword v55, off, s[0:3], s32 offset:68
+; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s32 offset:72
+; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s32 offset:76
+; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s32 offset:80
+; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s32 offset:84
+; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s32 offset:88
+; GFX10-NEXT:    buffer_store_dword v46, off, s[0:3], s32 offset:92
+; GFX10-NEXT:    buffer_store_dword v47, off, s[0:3], s32 offset:96
+; GFX10-NEXT:    buffer_store_dword v56, off, s[0:3], s32 offset:100
+; GFX10-NEXT:    buffer_store_dword v57, off, s[0:3], s32 offset:104
+; GFX10-NEXT:    buffer_store_dword v58, off, s[0:3], s32 offset:112
+; GFX10-NEXT:    buffer_store_dword v59, off, s[0:3], s32 offset:116
+; GFX10-NEXT:    buffer_store_dword v60, off, s[0:3], s32 offset:120
+; GFX10-NEXT:    buffer_store_dword v61, off, s[0:3], s32 offset:124
+; GFX10-NEXT:    buffer_store_dword v62, off, s[0:3], s32 offset:128
+; GFX10-NEXT:    buffer_store_dword v63, off, s[0:3], s32 offset:132
+; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:136
+; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:140
+; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:144
+; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:148
+; GFX10-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:152
+; GFX10-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:156
+; GFX10-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:160
+; GFX10-NEXT:    s_clause 0x7
+; GFX10-NEXT:    buffer_load_dword v2, off, s[0:3], s33 offset:1536
+; GFX10-NEXT:    buffer_load_dword v3, off, s[0:3], s33 offset:1540
+; GFX10-NEXT:    buffer_load_dword v4, off, s[0:3], s33 offset:1544
+; GFX10-NEXT:    buffer_load_dword v5, off, s[0:3], s33 offset:1548
+; GFX10-NEXT:    buffer_load_dword v6, off, s[0:3], s33 offset:1552
+; GFX10-NEXT:    buffer_load_dword v7, off, s[0:3], s33 offset:1556
+; GFX10-NEXT:    buffer_load_dword v8, off, s[0:3], s33 offset:1560
+; GFX10-NEXT:    buffer_load_dword v9, off, s[0:3], s33 offset:1564
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
+; GFX10-NEXT:    v_mov_b32_e32 v1, 42
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x400, v0
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
+; GFX10-NEXT:    s_clause 0xe
+; GFX10-NEXT:    buffer_load_dword v63, off, s[0:3], s33
+; GFX10-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:4
+; GFX10-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:8
+; GFX10-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:12
+; GFX10-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:16
+; GFX10-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:20
+; GFX10-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:24
+; GFX10-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:28
+; GFX10-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:32
+; GFX10-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:36
+; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:40
+; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:44
+; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:48
+; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:52
+; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:56
+; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX10-NEXT:    s_or_saveexec_b32 s34, -1
+; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_mov_b32 exec_lo, s34
+; GFX10-NEXT:    s_add_i32 s32, s32, 0xfffec000
+; GFX10-NEXT:    s_mov_b32 s33, s36
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: call_72xi32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s45, s33
+; GFX11-NEXT:    s_add_i32 s33, s32, 0x1ff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s33, s33, 0xfffffe00
+; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:1600 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    s_mov_b32 s1, s0
+; GFX11-NEXT:    s_mov_b32 s2, s0
+; GFX11-NEXT:    s_mov_b32 s3, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_addk_i32 s32, 0xa00
+; GFX11-NEXT:    s_clause 0xe
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:56
+; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:52
+; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:48
+; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:44
+; GFX11-NEXT:    scratch_store_b32 off, v45, s33 offset:40
+; GFX11-NEXT:    scratch_store_b32 off, v46, s33 offset:36
+; GFX11-NEXT:    scratch_store_b32 off, v47, s33 offset:32
+; GFX11-NEXT:    scratch_store_b32 off, v56, s33 offset:28
+; GFX11-NEXT:    scratch_store_b32 off, v57, s33 offset:24
+; GFX11-NEXT:    scratch_store_b32 off, v58, s33 offset:20
+; GFX11-NEXT:    scratch_store_b32 off, v59, s33 offset:16
+; GFX11-NEXT:    scratch_store_b32 off, v60, s33 offset:12
+; GFX11-NEXT:    scratch_store_b32 off, v61, s33 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v62, s33 offset:4
+; GFX11-NEXT:    scratch_store_b32 off, v63, s33
+; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x90
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-NEXT:    scratch_store_b32 off, v4, s0
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x80
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x70
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x60
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x50
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_add_i32 s0, s32, 64
+; GFX11-NEXT:    s_add_i32 s1, s32, 48
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, return_72xi32@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, return_72xi32@gotpcrel32@hi+12
+; GFX11-NEXT:    s_add_i32 s2, s32, 32
+; GFX11-NEXT:    s_load_b64 s[46:47], s[0:1], 0x0
+; GFX11-NEXT:    s_add_i32 s3, s32, 16
+; GFX11-NEXT:    s_add_i32 s0, s33, 0x200
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s2
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
+; GFX11-NEXT:    v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, 0
+; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, 0
+; GFX11-NEXT:    v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v10, 0
+; GFX11-NEXT:    v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v12, 0
+; GFX11-NEXT:    v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v14, 0
+; GFX11-NEXT:    v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v16, 0
+; GFX11-NEXT:    v_dual_mov_b32 v19, 0 :: v_dual_mov_b32 v18, 0
+; GFX11-NEXT:    v_dual_mov_b32 v21, 0 :: v_dual_mov_b32 v20, 0
+; GFX11-NEXT:    v_dual_mov_b32 v23, 0 :: v_dual_mov_b32 v22, 0
+; GFX11-NEXT:    v_dual_mov_b32 v25, 0 :: v_dual_mov_b32 v24, 0
+; GFX11-NEXT:    v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0
+; GFX11-NEXT:    v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0
+; GFX11-NEXT:    v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[46:47]
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b128 v[45:48], off, s33 offset:624
+; GFX11-NEXT:    scratch_load_b128 v[33:36], off, s33 offset:640
+; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_mov_b32_e32 v32, v48
+; GFX11-NEXT:    s_clause 0x9
+; GFX11-NEXT:    scratch_load_b128 v[48:51], off, s33 offset:656
+; GFX11-NEXT:    scratch_load_b128 v[52:55], off, s33 offset:672
+; GFX11-NEXT:    scratch_load_b128 v[41:44], off, s33 offset:688
+; GFX11-NEXT:    scratch_load_b128 v[56:59], off, s33 offset:704
+; GFX11-NEXT:    scratch_load_b128 v[60:63], off, s33 offset:720
+; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:736
+; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:752
+; GFX11-NEXT:    scratch_load_b128 v[4:7], off, s33 offset:768
+; GFX11-NEXT:    scratch_load_b128 v[8:11], off, s33 offset:784
+; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:512
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_dual_mov_b32 v38, v53 :: v_dual_mov_b32 v37, v52
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    v_dual_mov_b32 v39, v54 :: v_dual_mov_b32 v52, v44
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_dual_mov_b32 v53, v56 :: v_dual_mov_b32 v54, v57
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s33 offset:1584 ; 16-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:528
+; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:544
+; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:560
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:576
+; GFX11-NEXT:    v_mov_b32_e32 v56, v63
+; GFX11-NEXT:    v_mov_b32_e32 v16, v19
+; GFX11-NEXT:    v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v2
+; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v8, v15
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_dual_mov_b32 v10, v21 :: v_dual_mov_b32 v15, v26
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:592
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:608
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1536 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s32
+; GFX11-NEXT:    v_mov_b32_e32 v32, v36
+; GFX11-NEXT:    v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49
+; GFX11-NEXT:    v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v36, v51
+; GFX11-NEXT:    v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v41
+; GFX11-NEXT:    v_mov_b32_e32 v50, v42
+; GFX11-NEXT:    v_dual_mov_b32 v55, v58 :: v_dual_mov_b32 v58, v17
+; GFX11-NEXT:    v_dual_mov_b32 v17, v0 :: v_dual_mov_b32 v0, v3
+; GFX11-NEXT:    v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9
+; GFX11-NEXT:    scratch_store_b32 off, v11, s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x90
+; GFX11-NEXT:    v_mov_b32_e32 v51, v43
+; GFX11-NEXT:    v_mov_b32_e32 v41, v59
+; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v14
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x80
+; GFX11-NEXT:    v_dual_mov_b32 v42, v60 :: v_dual_mov_b32 v43, v61
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v9, v20
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x70
+; GFX11-NEXT:    v_mov_b32_e32 v5, v12
+; GFX11-NEXT:    scratch_store_b128 off, v[16:19], s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x6c
+; GFX11-NEXT:    v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v11, v22
+; GFX11-NEXT:    scratch_store_b32 off, v0, s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x60
+; GFX11-NEXT:    v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45
+; GFX11-NEXT:    scratch_store_b96 off, v[56:58], s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 0x50
+; GFX11-NEXT:    v_mov_b32_e32 v13, v24
+; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 64
+; GFX11-NEXT:    v_dual_mov_b32 v14, v25 :: v_dual_mov_b32 v31, v47
+; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 48
+; GFX11-NEXT:    v_mov_b32_e32 v16, v27
+; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 32
+; GFX11-NEXT:    v_mov_b32_e32 v30, v46
+; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s0
+; GFX11-NEXT:    s_add_i32 s0, s32, 16
+; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s0
+; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1584 ; 16-byte Folded Reload
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, 42
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_load_b128 v[17:20], off, s33 offset:1568
+; GFX11-NEXT:    scratch_load_b128 v[21:24], off, s33 offset:1552
+; GFX11-NEXT:    scratch_load_b128 v[25:28], off, s33 offset:1536
+; GFX11-NEXT:    s_add_i32 s0, s33, 0x400
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[46:47]
+; GFX11-NEXT:    s_clause 0xe
+; GFX11-NEXT:    scratch_load_b32 v63, off, s33
+; GFX11-NEXT:    scratch_load_b32 v62, off, s33 offset:4
+; GFX11-NEXT:    scratch_load_b32 v61, off, s33 offset:8
+; GFX11-NEXT:    scratch_load_b32 v60, off, s33 offset:12
+; GFX11-NEXT:    scratch_load_b32 v59, off, s33 offset:16
+; GFX11-NEXT:    scratch_load_b32 v58, off, s33 offset:20
+; GFX11-NEXT:    scratch_load_b32 v57, off, s33 offset:24
+; GFX11-NEXT:    scratch_load_b32 v56, off, s33 offset:28
+; GFX11-NEXT:    scratch_load_b32 v47, off, s33 offset:32
+; GFX11-NEXT:    scratch_load_b32 v46, off, s33 offset:36
+; GFX11-NEXT:    scratch_load_b32 v45, off, s33 offset:40
+; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:44
+; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:48
+; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:52
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:56
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:1600 ; 4-byte Folded Reload
+; GFX11-NEXT:    s_mov_b32 exec_lo, s0
+; GFX11-NEXT:    s_addk_i32 s32, 0xf600
+; GFX11-NEXT:    s_mov_b32 s33, s45
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %ret.0 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> zeroinitializer)
+  %val.0 = insertelement <72 x i32> %ret.0, i32 42, i32 0
+  %val.1 = insertelement <72 x i32> %val.0, i32 24, i32 58
+  %ret.1 = call amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val.1)
+  ret void
+}
+
+; Ensure all VGPRs are available: pin "amdgpu-waves-per-eu" and "amdgpu-flat-work-group-size" to "1,1"
+attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,1" }
+
+; Limit to 64 VGPRs, so a return needing registers above that limit must use the stack
+attributes #1 = { nounwind "amdgpu-num-vgpr"="64" }


        


More information about the llvm-commits mailing list