[llvm] 5e56d59 - Fix SGPR + offset Scratch offset folding

Petar Avramovic via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 9 01:55:08 PST 2023


Author: Petar Avramovic
Date: 2023-03-09T10:52:44+01:00
New Revision: 5e56d5999914538c8060e4fd9b23ec19650eebce

URL: https://github.com/llvm/llvm-project/commit/5e56d5999914538c8060e4fd9b23ec19650eebce
DIFF: https://github.com/llvm/llvm-project/commit/5e56d5999914538c8060e4fd9b23ec19650eebce.diff

LOG: Fix SGPR + offset Scratch offset folding

Values in an SGPR register are treated as unsigned by the hardware.

When the value in the 32-bit SGPR base can be negative, calculate the offset
using a 32-bit add instruction; otherwise use SGPR base (unsigned) + offset.
This does not affect the case where the whole offset comes from an SGPR
register (immediate offset is 0).

LoopStrengthReduce.cpp changes offsets to negative values, and in some
iterations the value in the SGPR register could be negative.

Differential Revision: https://reviews.llvm.org/D144955

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 28c26b2998f00..54cb8e59a8498 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1135,6 +1135,15 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
   return CurDAG->SignBitIsZero(Base);
 }
 
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
+                                                uint64_t FlatVariant) const {
+  if (FlatVariant != SIInstrFlags::FlatScratch)
+    return true;
+  // When value in 32-bit Base can be negative calculate scratch offset using
+  // 32-bit add instruction, otherwise use Base(unsigned) + offset.
+  return CurDAG->SignBitIsZero(Base);
+}
+
 // TODO: If offset is too big, put low 16-bit into offset.
 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
@@ -1760,7 +1769,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
 
   int64_t COffsetVal = 0;
 
-  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+  if (CurDAG->isBaseWithConstantOffset(Addr) &&
+      isFlatScratchBaseLegal(Addr.getOperand(0))) {
     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
     SAddr = Addr.getOperand(0);
   } else {

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 12912b77edaf0..ac5e9d0986011 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -158,6 +158,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
   bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
                         unsigned Size) const;
+  bool isFlatScratchBaseLegal(
+      SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                  SDValue &Offset1) const;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7d3536df7a0a0..088843779a893 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4010,7 +4010,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   // possible.
   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
 
-  if (ConstOffset != 0 &&
+  if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) &&
       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
                             SIInstrFlags::FlatScratch)) {
     Addr = PtrBase;
@@ -4234,6 +4234,16 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
   return KnownBits->signBitIsZero(Base);
 }
 
+bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
+    Register Base, uint64_t FlatVariant) const {
+  if (FlatVariant != SIInstrFlags::FlatScratch)
+    return true;
+
+  // When value in 32-bit Base can be negative calculate scratch offset using
+  // 32-bit add instruction, otherwise use Base(unsigned) + offset.
+  return KnownBits->signBitIsZero(Base);
+}
+
 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
                                                     unsigned ShAmtBits) const {
   assert(MI.getOpcode() == TargetOpcode::G_AND);

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 0ccf02ba41cfd..9f1376e6d0288 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
 
+#include "SIDefines.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/IR/InstrTypes.h"
 
@@ -236,6 +237,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool isDSOffsetLegal(Register Base, int64_t Offset) const;
   bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
                         unsigned Size) const;
+  bool isFlatScratchBaseLegal(
+      Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
 
   std::pair<Register, unsigned>
   selectDS1Addr1OffsetImpl(MachineOperand &Root) const;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 28a7b1a62a708..cf01ff4f90926 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -33,20 +33,24 @@ define amdgpu_kernel void @kernel_caller_stack() {
 ;
 ; FLATSCR-LABEL: kernel_caller_stack:
 ; FLATSCR:       ; %bb.0:
-; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; FLATSCR-NEXT:    s_mov_b32 s32, 0
+; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 4
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 9
-; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 8
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 10
-; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:8
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 12
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 11
-; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:12
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
+; FLATSCR-NEXT:    s_add_u32 s2, s32, 16
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 12
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
-; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:16
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s2
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; FLATSCR-NEXT:    s_endpgm
   call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
@@ -155,42 +159,40 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
+; FLATSCR-NEXT:    s_mov_b32 s8, 0
+; FLATSCR-NEXT:    s_mov_b32 s10, 0
+; FLATSCR-NEXT:    s_mov_b32 s9, 0
 ; FLATSCR-NEXT:    s_mov_b32 vcc_lo, 0
 ; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
 ; FLATSCR-NEXT:    s_mov_b32 s11, 0
-; FLATSCR-NEXT:    s_mov_b32 s10, 0
-; FLATSCR-NEXT:    s_mov_b32 s9, 0
-; FLATSCR-NEXT:    s_mov_b32 s8, 0
 ; FLATSCR-NEXT:    s_mov_b32 s7, 0
-; FLATSCR-NEXT:    s_mov_b32 s6, 0
 ; FLATSCR-NEXT:    s_mov_b32 s5, 0
+; FLATSCR-NEXT:    s_mov_b32 s3, 0
 ; FLATSCR-NEXT:    s_mov_b32 s1, 0
 ; FLATSCR-NEXT:    s_mov_b32 s0, 0
-; FLATSCR-NEXT:    s_mov_b32 s4, 0
-; FLATSCR-NEXT:    s_mov_b32 s3, 0
 ; FLATSCR-NEXT:    s_mov_b32 s2, 0
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_lo offset:8
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s11 offset:24
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s10 offset:32
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s9 offset:40
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s8 offset:48
+; FLATSCR-NEXT:    s_mov_b32 s4, 0
+; FLATSCR-NEXT:    s_mov_b32 s6, 0
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s8 offset:8
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s10 offset:16
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s9 offset:24
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_lo offset:32
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:40
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s11 offset:48
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s7 offset:56
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s6 offset:64
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s5 offset:72
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s5 offset:64
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s3 offset:72
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s1 offset:80
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:88
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s4 offset:96
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s3 offset:104
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s2 offset:112
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_lo offset:120
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:128
-; FLATSCR-NEXT:    s_mov_b32 s40, 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s40 offset:8
-; FLATSCR-NEXT:    s_mov_b32 s39, 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, s39 offset:16
-; FLATSCR-NEXT:    s_mov_b32 s38, 0
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[4:5], off, s38 offset:24
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s2 offset:96
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s4 offset:104
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s6 offset:112
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s8 offset:120
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s10 offset:128
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s9 offset:8
+; FLATSCR-NEXT:    s_nop 0
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, vcc_lo offset:16
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[4:5], off, vcc_hi offset:24
 ; FLATSCR-NEXT:    s_mov_b32 s37, 0
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[6:7], off, s37 offset:32
 ; FLATSCR-NEXT:    s_mov_b32 s36, 0
@@ -205,22 +207,29 @@ define amdgpu_kernel void @kernel_caller_byval() {
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
+; FLATSCR-NEXT:    s_add_u32 s2, s32, 8
+; FLATSCR-NEXT:    s_add_u32 s3, s32, 16
+; FLATSCR-NEXT:    s_add_u32 s4, s32, 24
+; FLATSCR-NEXT:    s_add_u32 s5, s32, 32
+; FLATSCR-NEXT:    s_add_u32 s6, s32, 40
+; FLATSCR-NEXT:    s_add_u32 s7, s32, 48
+; FLATSCR-NEXT:    s_add_u32 s8, s32, 56
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:8
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s2
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[4:5], s3
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:24
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[6:7], s4
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[8:9], s32 offset:32
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[8:9], s5
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[10:11], s32 offset:40
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[10:11], s6
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[12:13], s32 offset:48
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[12:13], s7
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[14:15], s32 offset:56
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[14:15], s8
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; FLATSCR-NEXT:    s_endpgm
   %alloca = alloca [16 x i32], align 4, addrspace(5)
@@ -277,17 +286,21 @@ define void @func_caller_stack() {
 ; FLATSCR-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
+; FLATSCR-NEXT:    v_writelane_b32 v41, s0, 0
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 4
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 9
-; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 8
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 10
-; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:8
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 12
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 11
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
-; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:12
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 16
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 12
-; FLATSCR-NEXT:    v_writelane_b32 v41, s0, 0
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
-; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:16
+; FLATSCR-NEXT:    scratch_store_dword off, v0, s0
 ; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
 ; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
 ; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
@@ -403,35 +416,42 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) {
 ; FLATSCR-NEXT:    s_mov_b64 exec, s[2:3]
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off
 ; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
-; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
 ; FLATSCR-NEXT:    v_writelane_b32 v41, s0, 0
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 8
+; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT:    s_add_u32 s2, s32, 56
 ; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
-; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
-; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
-; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:8
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:8
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:16
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 16
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:16
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:24
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 24
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:24
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:32
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 32
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:32
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:40
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 40
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:40
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:48
+; FLATSCR-NEXT:    s_add_u32 s0, s32, 48
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:48
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s0
 ; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], v0, off offset:56
+; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
+; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
+; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:56
+; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s2
 ; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
index 499ee9acbbf34..0cd62c9aeb258 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -353,14 +353,16 @@ define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, p
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    scratch_load_ubyte v2, off, s2 offset:1
+; GFX10-NEXT:    s_add_i32 s2, s2, 1
+; GFX10-NEXT:    scratch_load_ubyte v2, off, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    scratch_load_u8 v2, off, s0 offset:1
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:    scratch_load_u8 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -379,14 +381,16 @@ define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, p
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    scratch_load_sbyte v2, off, s2 offset:1
+; GFX10-NEXT:    s_add_i32 s2, s2, 1
+; GFX10-NEXT:    scratch_load_sbyte v2, off, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    scratch_load_i8 v2, off, s0 offset:1
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:    scratch_load_i8 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -405,14 +409,16 @@ define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in,
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    scratch_load_ushort v2, off, s2 offset:2
+; GFX10-NEXT:    s_add_i32 s2, s2, 2
+; GFX10-NEXT:    scratch_load_ushort v2, off, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_zext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    scratch_load_u16 v2, off, s0 offset:2
+; GFX11-NEXT:    s_add_i32 s0, s0, 2
+; GFX11-NEXT:    scratch_load_u16 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -431,14 +437,16 @@ define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in,
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    scratch_load_sshort v2, off, s2 offset:2
+; GFX10-NEXT:    s_add_i32 s2, s2, 2
+; GFX10-NEXT:    scratch_load_sshort v2, off, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_sext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    scratch_load_i16 v2, off, s0 offset:2
+; GFX11-NEXT:    s_add_i32 s0, s0, 2
+; GFX11-NEXT:    scratch_load_i16 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -458,7 +466,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX10-NEXT:    scratch_load_ubyte_d16 v2, off, s2 offset:1
+; GFX10-NEXT:    s_add_i32 s2, s2, 1
+; GFX10-NEXT:    scratch_load_ubyte_d16 v2, off, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -466,7 +475,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX11-NEXT:    scratch_load_d16_u8 v2, off, s0 offset:1
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:    scratch_load_d16_u8 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -488,7 +498,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX10-NEXT:    scratch_load_sbyte_d16 v2, off, s2 offset:1
+; GFX10-NEXT:    s_add_i32 s2, s2, 1
+; GFX10-NEXT:    scratch_load_sbyte_d16 v2, off, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -496,7 +507,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX11-NEXT:    scratch_load_d16_i8 v2, off, s0 offset:1
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:    scratch_load_d16_i8 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -518,7 +530,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX10-NEXT:    scratch_load_short_d16 v2, off, s2 offset:2
+; GFX10-NEXT:    s_add_i32 s2, s2, 2
+; GFX10-NEXT:    scratch_load_short_d16 v2, off, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -526,7 +539,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg
 ; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX11-NEXT:    scratch_load_d16_b16 v2, off, s0 offset:2
+; GFX11-NEXT:    s_add_i32 s0, s0, 2
+; GFX11-NEXT:    scratch_load_d16_b16 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -548,7 +562,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, -1
-; GFX10-NEXT:    scratch_load_ubyte_d16_hi v2, off, s2 offset:1
+; GFX10-NEXT:    s_add_i32 s2, s2, 1
+; GFX10-NEXT:    scratch_load_ubyte_d16_hi v2, off, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -556,7 +571,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, -1
-; GFX11-NEXT:    scratch_load_d16_hi_u8 v2, off, s0 offset:1
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:    scratch_load_d16_hi_u8 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -578,7 +594,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, -1
-; GFX10-NEXT:    scratch_load_sbyte_d16_hi v2, off, s2 offset:1
+; GFX10-NEXT:    s_add_i32 s2, s2, 1
+; GFX10-NEXT:    scratch_load_sbyte_d16_hi v2, off, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -586,7 +603,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, -1
-; GFX11-NEXT:    scratch_load_d16_hi_i8 v2, off, s0 offset:1
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:    scratch_load_d16_hi_i8 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -608,7 +626,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, -1
-; GFX10-NEXT:    scratch_load_short_d16_hi v2, off, s2 offset:2
+; GFX10-NEXT:    s_add_i32 s2, s2, 2
+; GFX10-NEXT:    scratch_load_short_d16_hi v2, off, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -616,7 +635,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg
 ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, -1
-; GFX11-NEXT:    scratch_load_d16_hi_b16 v2, off, s0 offset:2
+; GFX11-NEXT:    s_add_i32 s0, s0, 2
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v2, off, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -637,15 +657,17 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_s(ptr %in, ptr addrspac
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
+; GFX10-NEXT:    s_add_i32 s2, s2, 4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_byte_d16_hi off, v0, s2 offset:4
+; GFX10-NEXT:    scratch_store_byte_d16_hi off, v0, s2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX11-NEXT:    s_add_i32 s0, s0, 4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b8 off, v0, s0 offset:4
+; GFX11-NEXT:    scratch_store_d16_hi_b8 off, v0, s0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:
@@ -664,15 +686,17 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_s(ptr %in, ptr addrspa
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
+; GFX10-NEXT:    s_add_i32 s2, s2, 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_short_d16_hi off, v0, s2 offset:2
+; GFX10-NEXT:    scratch_store_short_d16_hi off, v0, s2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
+; GFX11-NEXT:    s_add_i32 s0, s0, 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s0 offset:2
+; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 bb:

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 8da0dc3c0e0ee..7f481b2a134e2 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -13845,6 +13845,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s2, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
@@ -13883,8 +13885,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX11-NEXT:    s_mov_b32 s24, s40
 ; GFX11-NEXT:    s_mov_b32 s25, s41
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s32 offset:16
+; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s2
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX11-NEXT:    s_mov_b32 s26, s42
@@ -13952,6 +13953,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
@@ -13993,7 +13995,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, s39
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, s40
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s2
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s26, s42
@@ -14321,6 +14323,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg@rel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg@rel32@hi+12
+; GFX11-NEXT:    s_add_i32 s3, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX11-NEXT:    v_writelane_b32 v40, s22, 18
@@ -14331,19 +14334,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49
 ; GFX11-NEXT:    v_writelane_b32 v40, s24, 20
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s48
+; GFX11-NEXT:    s_add_i32 s2, s32, 24
 ; GFX11-NEXT:    s_mov_b32 s20, s36
 ; GFX11-NEXT:    s_mov_b32 s21, s37
-; GFX11-NEXT:    s_mov_b32 s22, s38
 ; GFX11-NEXT:    v_writelane_b32 v40, s25, 21
+; GFX11-NEXT:    s_mov_b32 s22, s38
 ; GFX11-NEXT:    s_mov_b32 s23, s39
 ; GFX11-NEXT:    s_mov_b32 s24, s40
 ; GFX11-NEXT:    s_mov_b32 s25, s41
-; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_store_b32 off, v6, s32 offset:24
-; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s32 offset:16
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX11-NEXT:    s_mov_b32 s26, s42
+; GFX11-NEXT:    scratch_store_b32 off, v6, s2
+; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s3
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    v_writelane_b32 v40, s27, 23
 ; GFX11-NEXT:    s_mov_b32 s27, s43
 ; GFX11-NEXT:    v_writelane_b32 v40, s28, 24
@@ -14433,11 +14436,13 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg@rel32@lo+4
 ; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg@rel32@hi+12
+; GFX10-SCRATCH-NEXT:    s_add_i32 s3, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s20, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s21, 17
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, s2
+; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 24
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s23, 19
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
@@ -14453,8 +14458,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, s39
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, s40
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v6, s32 offset:24
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v6, s2
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s3
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s26, s42
@@ -15115,15 +15120,16 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13
-; GFX11-NEXT:    v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15
-; GFX11-NEXT:    v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9
-; GFX11-NEXT:    v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11
+; GFX11-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 9
+; GFX11-NEXT:    v_dual_mov_b32 v2, 10 :: v_dual_mov_b32 v3, 11
+; GFX11-NEXT:    v_dual_mov_b32 v4, 12 :: v_dual_mov_b32 v5, 13
+; GFX11-NEXT:    v_dual_mov_b32 v6, 14 :: v_dual_mov_b32 v7, 15
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    s_add_i32 s0, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
-; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s32
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1
@@ -15140,7 +15146,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v27, 5
 ; GFX11-NEXT:    v_dual_mov_b32 v28, 5 :: v_dual_mov_b32 v29, 5
 ; GFX11-NEXT:    v_dual_mov_b32 v30, 6 :: v_dual_mov_b32 v31, 7
-; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4
@@ -15171,18 +15176,20 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 12
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 13
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 14
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 8
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 9
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 10
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 11
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 8
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 9
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 10
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 11
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 12
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 13
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 14
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 15
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s0, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s32
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
@@ -15215,7 +15222,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 5
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 7
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4
@@ -15417,19 +15423,20 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x41400000
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41500000
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41600000
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41700000
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0x41000000
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0x41100000
-; GFX11-NEXT:    v_mov_b32_e32 v6, 0x41200000
-; GFX11-NEXT:    v_mov_b32_e32 v7, 0x41300000
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x41000000
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41100000
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41300000
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0x41400000
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0x41500000
+; GFX11-NEXT:    v_mov_b32_e32 v6, 0x41600000
+; GFX11-NEXT:    v_mov_b32_e32 v7, 0x41700000
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    s_add_i32 s0, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
-; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s32
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 1.0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
@@ -15448,7 +15455,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; GFX11-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX11-NEXT:    v_mov_b32_e32 v31, 0x40e00000
-; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4
@@ -15479,18 +15485,20 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x41400000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x41500000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0x41600000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0x41700000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0x41000000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 0x41100000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 0x41200000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 0x41300000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x41000000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x41100000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0x41300000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0x41400000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 0x41500000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 0x41600000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 0x41700000
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX10-SCRATCH-NEXT:    s_add_i32 s0, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s32
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
@@ -15523,7 +15531,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 0x40e00000
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v41, s0, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index a67a44971b647..ea4702bf0f27d 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -1497,137 +1497,263 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT:    s_clause 0x3e
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:2032
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:2016
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:2000
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1984
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1968
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1952
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1936
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1920
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1904
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1888
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1872
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1856
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1840
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1824
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1808
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1792
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1776
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1760
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1744
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1728
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1712
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1696
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1680
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1664
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1648
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1632
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1616
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1600
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1584
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1568
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1552
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1536
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1520
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1504
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1488
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1472
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1456
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1440
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1424
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1408
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1392
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1376
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1360
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1344
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1328
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1312
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1296
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1280
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1264
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1248
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1232
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1216
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1200
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1184
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1168
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1152
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1136
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1120
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1104
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1088
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1072
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1056
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1040
-; GFX11-NEXT:    s_clause 0x3e
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1024
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1008
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:992
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:976
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:960
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:944
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:928
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:912
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:896
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:880
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:864
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:848
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:832
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:816
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:800
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:784
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:768
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:752
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:736
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:720
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:704
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:688
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:672
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:656
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:640
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:624
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:608
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:592
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:576
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:560
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:544
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:528
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:512
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:496
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:480
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:464
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:448
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:432
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:416
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:400
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:384
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:368
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:352
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:336
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:320
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:304
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:288
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:272
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:256
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:240
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:224
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:208
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:192
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:176
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:160
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:144
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:128
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:112
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:96
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:80
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:64
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:16
-; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x7f0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x7e0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT:    s_add_i32 s3, s0, 0x7d0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s3
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x7c0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x7b0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x7a0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x790
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x780
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x770
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x760
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x750
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x740
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x730
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x720
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x710
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x700
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x6f0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x6e0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x6d0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x6c0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x6b0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x6a0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x690
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x680
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x670
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x660
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x650
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x640
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x630
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x620
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x610
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x600
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x5f0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x5e0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x5d0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x5c0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x5b0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x5a0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x590
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x580
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x570
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x560
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x550
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x540
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x530
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x520
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x510
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x500
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x4f0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x4e0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x4d0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x4c0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x4b0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x4a0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x490
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x480
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x470
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x460
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x450
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x440
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x430
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x420
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x410
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x400
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x3f0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x3e0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x3d0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x3c0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x3b0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x3a0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x390
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x380
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x370
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x360
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x350
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x340
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x330
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x320
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x310
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x300
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x2f0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x2e0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x2d0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x2c0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x2b0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x2a0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x290
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x280
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x270
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x260
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x250
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x240
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x230
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x220
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x210
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x200
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x1f0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x1e0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x1d0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x1c0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x1b0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x1a0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x190
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x180
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x170
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x160
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x150
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x140
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x130
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x120
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x110
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x100
+; GFX11-NEXT:    s_add_i32 s2, s0, 0xf0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0xe0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0xd0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0xc0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0xb0
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0xa0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x90
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x80
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x70
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x60
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x50
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 64
+; GFX11-NEXT:    s_add_i32 s2, s0, 48
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
+; GFX11-NEXT:    s_add_i32 s1, s0, 32
+; GFX11-NEXT:    s_add_i32 s0, s0, 16
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:


        


More information about the llvm-commits mailing list