[llvm] [AMDGPU] Folding imm offset in more cases for scratch access (PR #70634)

via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 31 07:34:45 PDT 2023


https://github.com/ruiling updated https://github.com/llvm/llvm-project/pull/70634

>From d8c8b022c5f134cd21a81f1953114a9291ea37fd Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Mon, 30 Oct 2023 15:54:13 +0800
Subject: [PATCH 1/2] [AMDGPU] Folding imm offset in more cases for scratch
 access

For scratch load/store, our hardware only accept non-negative value
in SGPR/VGPR. Besides the case that we can prove from known bits,
we can also prove that the value in `base` will be non-negative:
1.) When the ADD for the address calculatioon has NonUnsignedWrap flag.
2.) When the immediate offset is already negative.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  24 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |   3 +-
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |  18 +-
 .../CodeGen/AMDGPU/flat-scratch-i8-i16.ll     |  92 +--
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      | 418 ++++++-------
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |  51 +-
 .../AMDGPU/gfx-callable-return-types.ll       | 592 +++++++++---------
 llvm/test/CodeGen/AMDGPU/memory_clause.ll     |  18 +-
 8 files changed, 565 insertions(+), 651 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b5ceaaa14b4fd5e..691d644badd24b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1146,10 +1146,23 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
   return CurDAG->SignBitIsZero(Base);
 }
 
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr, SDValue Base,
                                                 uint64_t FlatVariant) const {
   if (FlatVariant != SIInstrFlags::FlatScratch)
     return true;
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    // For `nuw` addition, we should not have negative base address.
+    if (Addr->getFlags().hasNoUnsignedWrap())
+      return true;
+
+    auto *RHS = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    // If the immediate offset is negative, we should not have the base being
+    // negative as well.
+    if (RHS && RHS->getSExtValue() < 0)
+      return true;
+  }
+
   // When value in 32-bit Base can be negative calculate scratch offset using
   // 32-bit add instruction, otherwise use Base(unsigned) + offset.
   return CurDAG->SignBitIsZero(Base);
@@ -1549,7 +1562,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
     SDValue N0, N1;
     if (isBaseWithConstantOffset64(Addr, N0, N1) &&
-        isFlatScratchBaseLegal(N0, FlatVariant)) {
+        isFlatScratchBaseLegal(Addr, N0, FlatVariant)) {
       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
       const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1782,7 +1795,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
   int64_t COffsetVal = 0;
 
   if (CurDAG->isBaseWithConstantOffset(Addr) &&
-      isFlatScratchBaseLegal(Addr.getOperand(0))) {
+      isFlatScratchBaseLegal(Addr, Addr.getOperand(0))) {
     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
     SAddr = Addr.getOperand(0);
   } else {
@@ -1860,7 +1873,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
         VAddr = SDValue(VMov, 0);
         SAddr = LHS;
-        if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+        if (!isFlatScratchBaseLegal(Addr, SAddr))
           return false;
         if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
           return false;
@@ -1886,7 +1899,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
     return false;
   }
 
-  if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+  if (!isFlatScratchBaseLegal(Addr, SAddr) ||
+      !isFlatScratchBaseLegal(Addr, VAddr))
     return false;
 
   if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a8a606f60a3faee..8a47757f70bbfbc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -155,7 +155,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
                         unsigned Size) const;
   bool isFlatScratchBaseLegal(
-      SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+      SDValue Addr, SDValue Base,
+      uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
 
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index af023835c529776..329f0a2068cb072 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -704,11 +704,11 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR:       ; %bb.0: ; %bb
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
-; FLATSCR-NEXT:    v_add_u32_e32 v2, 2, v0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; FLATSCR-NEXT:    scratch_load_short_d16 v0, v2, off
+; FLATSCR-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; FLATSCR-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
@@ -726,22 +726,22 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR_GFX10:       ; %bb.0: ; %bb
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
-; FLATSCR_GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v0
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v0, v2, off
+; FLATSCR_GFX10-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, 2, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; GFX11-NEXT:    scratch_load_d16_b16 v0, v2, off
+; GFX11-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; GFX11-NEXT:    scratch_load_d16_b16 v1, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
index c849cf08094e718..ad4d4a4a30fc6d0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -11,16 +11,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %ou
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -38,16 +36,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %ou
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
-; GFX10-NEXT:    scratch_load_sbyte v0, v0, off
+; GFX10-NEXT:    scratch_load_sbyte v0, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
-; GFX11-NEXT:    scratch_load_i8 v0, v0, off
+; GFX11-NEXT:    scratch_load_i8 v0, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -65,16 +61,14 @@ define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %o
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
-; GFX10-NEXT:    scratch_load_ushort v0, v0, off
+; GFX10-NEXT:    scratch_load_ushort v0, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_zext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 2, v0
-; GFX11-NEXT:    scratch_load_u16 v0, v0, off
+; GFX11-NEXT:    scratch_load_u16 v0, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -92,16 +86,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %o
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
-; GFX10-NEXT:    scratch_load_sshort v0, v0, off
+; GFX10-NEXT:    scratch_load_sshort v0, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_sext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 2, v0
-; GFX11-NEXT:    scratch_load_i16 v0, v0, off
+; GFX11-NEXT:    scratch_load_i16 v0, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -359,16 +351,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, p
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    s_add_i32 s2, s2, 1
-; GFX10-NEXT:    scratch_load_ubyte v2, off, s2
+; GFX10-NEXT:    scratch_load_ubyte v2, off, s2 offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_add_i32 s0, s0, 1
-; GFX11-NEXT:    scratch_load_u8 v2, off, s0
+; GFX11-NEXT:    scratch_load_u8 v2, off, s0 offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -386,16 +376,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, p
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    s_add_i32 s2, s2, 1
-; GFX10-NEXT:    scratch_load_sbyte v2, off, s2
+; GFX10-NEXT:    scratch_load_sbyte v2, off, s2 offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_add_i32 s0, s0, 1
-; GFX11-NEXT:    scratch_load_i8 v2, off, s0
+; GFX11-NEXT:    scratch_load_i8 v2, off, s0 offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -413,16 +401,14 @@ define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in,
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    s_add_i32 s2, s2, 2
-; GFX10-NEXT:    scratch_load_ushort v2, off, s2
+; GFX10-NEXT:    scratch_load_ushort v2, off, s2 offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_zext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_add_i32 s0, s0, 2
-; GFX11-NEXT:    scratch_load_u16 v2, off, s0
+; GFX11-NEXT:    scratch_load_u16 v2, off, s0 offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -440,16 +426,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in,
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    s_add_i32 s2, s2, 2
-; GFX10-NEXT:    scratch_load_sshort v2, off, s2
+; GFX10-NEXT:    scratch_load_sshort v2, off, s2 offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_sext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_add_i32 s0, s0, 2
-; GFX11-NEXT:    scratch_load_i16 v2, off, s0
+; GFX11-NEXT:    scratch_load_i16 v2, off, s0 offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -713,19 +697,16 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in,
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_svs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -745,19 +726,16 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in,
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT:    scratch_load_sbyte v0, v0, off
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    scratch_load_sbyte v0, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_svs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT:    scratch_load_i8 v0, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    scratch_load_i8 v0, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -777,19 +755,16 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
-; GFX10-NEXT:    scratch_load_ushort v0, v0, off
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    scratch_load_ushort v0, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_zext_svs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 2
-; GFX11-NEXT:    scratch_load_u16 v0, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    scratch_load_u16 v0, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -809,19 +784,16 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
-; GFX10-NEXT:    scratch_load_sshort v0, v0, off
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    scratch_load_sshort v0, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_sext_svs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 2
-; GFX11-NEXT:    scratch_load_i16 v0, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    scratch_load_i16 v0, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 07b3df2a8520aae..fe984ffa653a5ef 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -576,11 +576,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
-; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -592,24 +591,22 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-NEXT:    scratch_store_dword v0, v2, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
+; GFX10-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
-; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:4 dlc
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -628,8 +625,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -640,8 +636,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
-; GFX940-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -658,24 +653,22 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
+; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
+; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
+; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_kernel:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off offset:4 dlc
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -798,58 +791,53 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: private_ptr_foo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_ptr_foo:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_ptr_foo:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
-; GFX11-NEXT:    scratch_store_b32 v0, v1, off
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: private_ptr_foo:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: private_ptr_foo:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: private_ptr_foo:
 ; GFX10-PAL:       ; %bb.0:
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: private_ptr_foo:
 ; GFX11-PAL:       ; %bb.0:
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
   store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -1554,11 +1542,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1572,26 +1559,23 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-NEXT:    scratch_store_dword v0, v2, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX10-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:260 dlc
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -1610,11 +1594,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s0 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x104, v0
-; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
 ; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -1627,8 +1610,7 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:260 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0x104, v0
-; GFX940-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -1648,12 +1630,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, s0 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX1010-PAL-NEXT:    scratch_store_dword v0, v2, off
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
+; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
@@ -1672,26 +1653,23 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX1030-PAL-NEXT:    scratch_store_dword v0, v2, off
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
+; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1030-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off offset:260 dlc
+; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -2595,11 +2573,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2613,27 +2590,24 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-NEXT:    scratch_store_dword v0, v2, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
+; GFX10-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_movk_i32 s0, 0x4004
 ; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-NEXT:    scratch_store_b32 v0, v2, s0 dlc
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -2652,11 +2626,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s0 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x4004, v0
-; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
 ; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -2670,8 +2643,7 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX940-NEXT:    scratch_store_dword v0, v1, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
-; GFX940-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -2691,12 +2663,11 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, s0 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX1010-PAL-NEXT:    scratch_store_dword v0, v2, off
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
+; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
+; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
@@ -2715,27 +2686,24 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX1030-PAL-NEXT:    scratch_store_dword v0, v2, off
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
+; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
+; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1030-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x4004
 ; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s0 dlc
+; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, s0 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -2922,15 +2890,13 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ;
 ; GFX11-LABEL: store_load_large_imm_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT:    s_movk_i32 s0, 0x3000
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s0, s0, 4
+; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
+; GFX11-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX11-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 off, v1, s0 offset:3712 dlc
+; GFX11-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, off, s0 offset:3712 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -2959,14 +2925,13 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX940-LABEL: store_load_large_imm_offset_kernel:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 13
-; GFX940-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    s_add_i32 s0, s0, 4
-; GFX940-NEXT:    v_mov_b32_e32 v0, 15
-; GFX940-NEXT:    scratch_store_dword off, v0, s0 offset:3712 sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:3716 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, off, s0 offset:3712 sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:3716 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -3019,15 +2984,13 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ;
 ; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x3000
-; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_store_b32 off, v1, s0 offset:3712 dlc
+; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s0 offset:3712 glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -3077,16 +3040,13 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-LABEL: store_load_large_imm_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT:    s_movk_i32 s0, 0x3000
-; GFX11-NEXT:    s_add_i32 s1, s32, 4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
+; GFX11-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 off, v1, s0 offset:3712 dlc
+; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, off, s0 offset:3712 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3110,15 +3070,13 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 13
-; GFX940-NEXT:    s_movk_i32 s0, 0x3000
-; GFX940-NEXT:    s_add_i32 s1, s32, 4
 ; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    s_add_i32 s0, s0, s1
-; GFX940-NEXT:    v_mov_b32_e32 v0, 15
-; GFX940-NEXT:    scratch_store_dword off, v0, s0 offset:3712 sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, off, s0 offset:3712 sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3141,16 +3099,13 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-PAL-LABEL: store_load_large_imm_offset_foo:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x3000
-; GFX11-PAL-NEXT:    s_add_i32 s1, s32, 4
-; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-PAL-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_store_b32 off, v1, s0 offset:3712 dlc
+; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s0 offset:3712 glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -3168,17 +3123,16 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-LABEL: store_load_vidx_sidx_offset:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x400, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 15
-; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -3189,29 +3143,26 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v1, 4
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GFX10-NEXT:    v_add3_u32 v0, v1, v0, 0x400
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v0, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vidx_sidx_offset:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, v1, v0, 0x400
-; GFX11-NEXT:    v_mov_b32_e32 v1, 15
-; GFX11-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:1024 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1024 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -3226,13 +3177,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
-; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x400, v0
+; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
-; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -3243,11 +3193,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX940-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX940-NEXT:    v_add_u32_e32 v0, 0x400, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 15
-; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1024 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1024 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -3263,29 +3212,26 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
 ; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 4
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GFX10-PAL-NEXT:    v_add3_u32 v0, v1, v0, 0x400
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
 ; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT:    v_add3_u32 v0, v1, v0, 0x400
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1024 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1024 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -3657,21 +3603,20 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX10-LABEL: store_load_i32_negative_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, -1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-NEXT:    scratch_store_byte v0, v1, off
+; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i32_negative_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
-; GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
+; GFX11-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3697,24 +3642,34 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-PAL-LABEL: store_load_i32_negative_unaligned:
-; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-PAL-NEXT:    scratch_store_byte v0, v1, off
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_i32_negative_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
-; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off dlc
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -3739,21 +3694,21 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX10-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xffffef7f, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-NEXT:    scratch_store_byte v0, v1, off
+; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0
-; GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
+; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
+; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3771,32 +3726,43 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX940-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
+; GFX940-NEXT:    s_movk_i32 s0, 0xef7f
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 1
-; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_ubyte v0, v0, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-PAL-LABEL: store_load_i32_large_negative_unaligned:
-; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffef7f, v0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-PAL-NEXT:    scratch_store_byte v0, v1, off
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0
-; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off dlc
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
+; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 9df4b903de4bda0..e7d86c0c178e970 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -1561,20 +1561,16 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
 ; GFX11-NEXT:    buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
 ; GFX11-NEXT:    buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
-; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
-; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
 ; GFX11-NEXT:    buffer_load_b32 v33, off, s[0:3], 0 offset:128
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
 ; GFX11-NEXT:    s_add_i32 s3, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s4, s0, 64
-; GFX11-NEXT:    s_add_i32 s5, s0, 48
-; GFX11-NEXT:    s_add_i32 s6, s0, 32
-; GFX11-NEXT:    s_add_i32 s7, s0, 16
-; GFX11-NEXT:    s_add_i32 s8, s0, 0x80
+; GFX11-NEXT:    s_add_i32 s4, s0, 48
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
@@ -1582,17 +1578,17 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s0 offset:64
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s4
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s0 offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0 offset:32
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 off, v33, s8
+; GFX11-NEXT:    scratch_store_b32 off, v33, s0 offset:128
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <33 x i32>, ptr addrspace(1) %ptr
@@ -1854,20 +1850,16 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
 ; GFX11-NEXT:    buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
 ; GFX11-NEXT:    buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
-; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
-; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
 ; GFX11-NEXT:    buffer_load_b32 v33, off, s[0:3], 0 offset:128
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
 ; GFX11-NEXT:    s_add_i32 s3, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s4, s0, 64
-; GFX11-NEXT:    s_add_i32 s5, s0, 48
-; GFX11-NEXT:    s_add_i32 s6, s0, 32
-; GFX11-NEXT:    s_add_i32 s7, s0, 16
-; GFX11-NEXT:    s_add_i32 s8, s0, 0x80
+; GFX11-NEXT:    s_add_i32 s4, s0, 48
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
@@ -1875,17 +1867,17 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s0 offset:64
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s4
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s0 offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0 offset:32
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 off, v33, s8
+; GFX11-NEXT:    scratch_store_b32 off, v33, s0 offset:128
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
@@ -2160,7 +2152,6 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX11-NEXT:    s_add_i32 s5, s0, 0xb0
 ; GFX11-NEXT:    s_add_i32 s6, s0, 0xa0
 ; GFX11-NEXT:    s_add_i32 s7, s0, 0x90
-; GFX11-NEXT:    s_add_i32 s8, s0, 0x80
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
@@ -2176,7 +2167,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s7
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s8
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0 offset:128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b32 off, v33, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 0d54da3128a617a..23502d1b36d1828 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -1990,135 +1990,140 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:1024
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:512
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:256
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:128
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:64
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:32
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x7f0
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x7e0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT:    s_add_i32 s3, s0, 0x7d0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s3
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x7c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x7b0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x7d0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x7c0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x7a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x790
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x7b0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x7a0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x780
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x770
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x790
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x780
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x760
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x750
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x770
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x760
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x740
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x730
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x750
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x740
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x720
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x710
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x730
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x720
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x700
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x6f0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x710
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x700
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x6e0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x6d0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x6f0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x6e0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x6c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x6b0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x6d0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x6c0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x6a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x690
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x6b0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x6a0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x680
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x670
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x690
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x680
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x660
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x650
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x670
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x660
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x640
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x630
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x650
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x640
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x620
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x610
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x630
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x620
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x600
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x5f0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x610
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x600
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x5e0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x5d0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x5f0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x5e0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x5c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x5b0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x5d0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x5c0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x5a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x590
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x5b0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x5a0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x580
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x570
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x590
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x580
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x560
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x550
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x570
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x560
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x540
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x530
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x550
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x540
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x520
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x510
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x530
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x520
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x500
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x4f0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x510
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x500
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x4e0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x4d0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x4f0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x4e0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x4c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x4b0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x4d0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x4c0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x4a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x490
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x4b0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x4a0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x480
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x470
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x490
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x480
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x460
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x450
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x470
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x460
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x440
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x430
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x450
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x440
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x420
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x410
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x430
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x420
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x400
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x410
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x3f0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
@@ -2182,39 +2187,35 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x210
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x200
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x1f0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x1f0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x1e0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x1e0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x1d0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x1d0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x1c0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x1c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x1b0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x1b0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x1a0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x1a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x190
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x190
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x180
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x180
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x170
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x170
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x160
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x160
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x150
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x150
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x140
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x140
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x130
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x130
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x120
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x120
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x110
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x100
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x110
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0xf0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
@@ -2230,20 +2231,12 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x90
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x80
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x70
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x60
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x50
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 64
-; GFX11-NEXT:    s_add_i32 s2, s0, 48
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 32
-; GFX11-NEXT:    s_add_i32 s0, s0, 16
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x50
+; GFX11-NEXT:    s_add_i32 s0, s0, 48
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -2643,141 +2636,122 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX11-LABEL: return_72xi32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x10
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:220
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:216
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:212
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:208
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:204
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:200
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:196
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:192
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:188
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:184
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:180
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:176
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:172
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:168
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:164
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s32 offset:224
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s32 offset:240
-; GFX11-NEXT:    s_clause 0x12
-; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:160
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:156
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:152
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:148
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:144
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:140
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:136
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:60
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:72
-; GFX11-NEXT:    v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v26, v23
-; GFX11-NEXT:    v_dual_mov_b32 v25, v22 :: v_dual_mov_b32 v24, v21
+; GFX11-NEXT:    s_clause 0xc
+; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:212
+; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:208
+; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:204
+; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:200
+; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:196
+; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:192
+; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:188
+; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:184
+; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:180
+; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:176
+; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:172
+; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:168
+; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:164
+; GFX11-NEXT:    s_clause 0x14
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:120
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s0 offset:64
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_load_b32 v20, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_b32 v19, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_b32 v18, off, s32 offset:136
+; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s0 offset:32
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_load_b32 v12, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_b32 v11, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_b32 v10, off, s32 offset:152
+; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s0 offset:16
+; GFX11-NEXT:    s_clause 0xd
+; GFX11-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v9, off, s32 offset:148
+; GFX11-NEXT:    scratch_load_b32 v17, off, s32 offset:132
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x110
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT:    v_dual_mov_b32 v23, v20 :: v_dual_mov_b32 v22, v19
-; GFX11-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v17
-; GFX11-NEXT:    v_dual_mov_b32 v19, v16 :: v_dual_mov_b32 v18, v15
-; GFX11-NEXT:    v_dual_mov_b32 v17, v14 :: v_dual_mov_b32 v16, v13
-; GFX11-NEXT:    v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v14, v11
-; GFX11-NEXT:    v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v12, v9
-; GFX11-NEXT:    v_dual_mov_b32 v11, v8 :: v_dual_mov_b32 v10, v7
-; GFX11-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b32 v7, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_b32 v6, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_b32 v5, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:104
-; GFX11-NEXT:    s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT:    s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT:    s_add_i32 s34, s0, 0xc0
-; GFX11-NEXT:    s_add_i32 s35, s0, 0xb0
-; GFX11-NEXT:    s_add_i32 s36, s0, 0xa0
-; GFX11-NEXT:    s_add_i32 s37, s0, 0x90
-; GFX11-NEXT:    s_add_i32 s38, s0, 0x80
-; GFX11-NEXT:    s_add_i32 s39, s0, 0x70
-; GFX11-NEXT:    s_add_i32 s40, s0, 0x60
-; GFX11-NEXT:    s_add_i32 s41, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s42, s0, 64
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x100
+; GFX11-NEXT:    s_add_i32 s3, s0, 0xf0
+; GFX11-NEXT:    s_add_i32 s34, s0, 0xe0
+; GFX11-NEXT:    s_add_i32 s35, s0, 0xd0
+; GFX11-NEXT:    s_add_i32 s36, s0, 0xc0
+; GFX11-NEXT:    s_add_i32 s37, s0, 0xb0
+; GFX11-NEXT:    s_add_i32 s38, s0, 0xa0
+; GFX11-NEXT:    s_add_i32 s39, s0, 0x90
+; GFX11-NEXT:    s_add_i32 s40, s0, 0x70
+; GFX11-NEXT:    s_add_i32 s41, s0, 0x60
+; GFX11-NEXT:    s_add_i32 s42, s0, 0x50
 ; GFX11-NEXT:    s_add_i32 s43, s0, 48
-; GFX11-NEXT:    s_add_i32 s44, s0, 32
-; GFX11-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-NEXT:    scratch_store_b128 off, v[45:48], s1
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x100
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:108
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    scratch_store_b128 off, v[56:59], s1
-; GFX11-NEXT:    s_clause 0xc
-; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_b32 v4, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s32 offset:224
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT:    s_add_i32 s0, s0, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s32 offset:224 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[59:62], s2
-; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s3
-; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s34
-; GFX11-NEXT:    scratch_store_b128 off, v[37:40], s35
-; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s36
-; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s37
-; GFX11-NEXT:    scratch_store_b128 off, v[33:36], s38
-; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s32 offset:224 ; 16-byte Folded Reload
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s39
-; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s32 offset:240 ; 16-byte Folded Reload
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s0 offset:128
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s1
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s2
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    scratch_store_b128 off, v[60:63], s3
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    scratch_store_b128 off, v[56:59], s34
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s35
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    scratch_store_b128 off, v[37:40], s36
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s37
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s38
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    scratch_store_b128 off, v[33:36], s39
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s40
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s41
-; GFX11-NEXT:    scratch_store_b128 off, v[20:23], s42
-; GFX11-NEXT:    scratch_store_b128 off, v[16:19], s43
-; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s44
-; GFX11-NEXT:    scratch_store_b128 off, v[8:11], s0
-; GFX11-NEXT:    s_clause 0xe
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:164
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:168
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:172
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:176
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:180
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:184
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:188
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:192
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:196
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:200
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:204
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:208
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:212
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:216
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:220
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s40
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s41
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s42
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s43
+; GFX11-NEXT:    s_clause 0xc
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:164
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:168
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:172
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:176
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:180
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:184
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:188
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:192
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:196
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:200
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:204
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:208
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:212
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   ret <72 x i32> %val
@@ -3334,12 +3308,12 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-LABEL: call_72xi32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s45, s33
+; GFX11-NEXT:    s_mov_b32 s46, s33
 ; GFX11-NEXT:    s_add_i32 s33, s32, 0x1ff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s33, s33, 0xfffffe00
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v32, s33 offset:1536 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
@@ -3349,8 +3323,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_addk_i32 s32, 0xa00
-; GFX11-NEXT:    s_clause 0xf
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:60
+; GFX11-NEXT:    s_clause 0xe
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:56
 ; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:52
 ; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:48
@@ -3388,7 +3361,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
 ; GFX11-NEXT:    s_add_i32 s0, s33, 0x200
-; GFX11-NEXT:    v_writelane_b32 v32, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
@@ -3405,110 +3378,114 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
-; GFX11-NEXT:    s_mov_b32 s47, return_72xi32 at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s46, return_72xi32 at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[46:47]
-; GFX11-NEXT:    s_clause 0xb
-; GFX11-NEXT:    scratch_load_b128 v[33:36], off, s33 offset:624
-; GFX11-NEXT:    scratch_load_b128 v[26:29], off, s33 offset:640
+; GFX11-NEXT:    s_mov_b32 s45, return_72xi32 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s44, return_72xi32 at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b128 v[45:48], off, s33 offset:624
+; GFX11-NEXT:    scratch_load_b128 v[33:36], off, s33 offset:640
+; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_mov_b32_e32 v32, v48
+; GFX11-NEXT:    s_clause 0x9
 ; GFX11-NEXT:    scratch_load_b128 v[48:51], off, s33 offset:656
 ; GFX11-NEXT:    scratch_load_b128 v[52:55], off, s33 offset:672
-; GFX11-NEXT:    scratch_load_b128 v[40:43], off, s33 offset:688
-; GFX11-NEXT:    scratch_load_b128 v[44:47], off, s33 offset:704
-; GFX11-NEXT:    scratch_load_b128 v[56:59], off, s33 offset:720
-; GFX11-NEXT:    scratch_load_b128 v[60:63], off, s33 offset:736
+; GFX11-NEXT:    scratch_load_b128 v[41:44], off, s33 offset:688
+; GFX11-NEXT:    scratch_load_b128 v[56:59], off, s33 offset:704
+; GFX11-NEXT:    scratch_load_b128 v[60:63], off, s33 offset:720
+; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:736
 ; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:752
 ; GFX11-NEXT:    scratch_load_b128 v[4:7], off, s33 offset:768
 ; GFX11-NEXT:    scratch_load_b128 v[8:11], off, s33 offset:784
-; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:512
-; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
-; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_dual_mov_b32 v31, v50 :: v_dual_mov_b32 v30, v49
+; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:512
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_dual_mov_b32 v38, v53 :: v_dual_mov_b32 v37, v52
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_dual_mov_b32 v49, v40 :: v_dual_mov_b32 v50, v41
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_dual_mov_b32 v41, v56 :: v_dual_mov_b32 v40, v47
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_dual_mov_b32 v47, v2 :: v_dual_mov_b32 v2, v5
-; GFX11-NEXT:    v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v27
+; GFX11-NEXT:    v_dual_mov_b32 v39, v54 :: v_dual_mov_b32 v52, v44
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_dual_mov_b32 v53, v56 :: v_dual_mov_b32 v54, v57
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v12
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s33 offset:1588 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b128 off, v[16:19], s33 offset:1588 ; 16-byte Folded Spill
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:528
-; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:544
-; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:560
-; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:576
-; GFX11-NEXT:    v_dual_mov_b32 v39, v28 :: v_dual_mov_b32 v28, v29
-; GFX11-NEXT:    v_dual_mov_b32 v29, v48 :: v_dual_mov_b32 v48, v55
-; GFX11-NEXT:    v_dual_mov_b32 v55, v46 :: v_dual_mov_b32 v46, v1
-; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, v7
-; GFX11-NEXT:    v_mov_b32_e32 v5, v8
-; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v56, v59
+; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:528
+; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:544
+; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:560
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:576
+; GFX11-NEXT:    v_mov_b32_e32 v56, v63
+; GFX11-NEXT:    v_mov_b32_e32 v12, v15
+; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v15, v2
+; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_mov_b32_e32 v8, v15
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v17 :: v_dual_mov_b32 v15, v22
+; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v8, v19
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_mov_b32_e32 v10, v21
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:1572 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:592
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:592
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:1556 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:608
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:608
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:1540 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s32
-; GFX11-NEXT:    v_dual_mov_b32 v37, v52 :: v_dual_mov_b32 v38, v53
-; GFX11-NEXT:    v_mov_b32_e32 v39, v54
-; GFX11-NEXT:    v_dual_mov_b32 v53, v44 :: v_dual_mov_b32 v54, v45
-; GFX11-NEXT:    v_dual_mov_b32 v44, v63 :: v_dual_mov_b32 v45, v0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v3, v6
-; GFX11-NEXT:    v_mov_b32_e32 v6, v9
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s32
+; GFX11-NEXT:    v_mov_b32_e32 v32, v36
+; GFX11-NEXT:    v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49
+; GFX11-NEXT:    v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v36, v51
+; GFX11-NEXT:    v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v41
+; GFX11-NEXT:    v_mov_b32_e32 v50, v42
+; GFX11-NEXT:    v_dual_mov_b32 v55, v58 :: v_dual_mov_b32 v58, v13
+; GFX11-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v0, v3
+; GFX11-NEXT:    v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9
 ; GFX11-NEXT:    scratch_store_b32 off, v11, s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x90
-; GFX11-NEXT:    v_dual_mov_b32 v36, v51 :: v_dual_mov_b32 v51, v42
-; GFX11-NEXT:    v_mov_b32_e32 v52, v43
+; GFX11-NEXT:    v_mov_b32_e32 v51, v43
+; GFX11-NEXT:    v_mov_b32_e32 v41, v59
 ; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v18
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x80
-; GFX11-NEXT:    v_mov_b32_e32 v42, v57
+; GFX11-NEXT:    v_dual_mov_b32 v42, v60 :: v_dual_mov_b32 v43, v61
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v5, v12
+; GFX11-NEXT:    v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v9, v20
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x70
-; GFX11-NEXT:    v_mov_b32_e32 v43, v58
-; GFX11-NEXT:    v_dual_mov_b32 v57, v60 :: v_dual_mov_b32 v58, v61
-; GFX11-NEXT:    scratch_store_b128 off, v[44:47], s0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v16
+; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x6c
-; GFX11-NEXT:    v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v7, v14
+; GFX11-NEXT:    v_dual_mov_b32 v6, v17 :: v_dual_mov_b32 v11, v22
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x60
-; GFX11-NEXT:    v_mov_b32_e32 v9, v16
+; GFX11-NEXT:    v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45
 ; GFX11-NEXT:    scratch_store_b96 off, v[56:58], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x50
-; GFX11-NEXT:    v_mov_b32_e32 v11, v18
-; GFX11-NEXT:    scratch_store_b128 off, v[40:43], s0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v24
+; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
-; GFX11-NEXT:    v_dual_mov_b32 v12, v19 :: v_dual_mov_b32 v13, v20
+; GFX11-NEXT:    v_dual_mov_b32 v14, v25 :: v_dual_mov_b32 v31, v47
 ; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 48
-; GFX11-NEXT:    v_mov_b32_e32 v14, v21
+; GFX11-NEXT:    v_mov_b32_e32 v15, v26
 ; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 32
-; GFX11-NEXT:    v_mov_b32_e32 v16, v23
+; GFX11-NEXT:    v_mov_b32_e32 v16, v27
 ; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 16
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s0
-; GFX11-NEXT:    v_mov_b32_e32 v29, v33
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1588
+; GFX11-NEXT:    v_mov_b32_e32 v30, v46
+; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s0
+; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, 42
+; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    scratch_load_b128 v[17:20], off, s33 offset:1572
 ; GFX11-NEXT:    scratch_load_b128 v[21:24], off, s33 offset:1556
 ; GFX11-NEXT:    scratch_load_b128 v[25:28], off, s33 offset:1540
 ; GFX11-NEXT:    s_add_i32 s0, s33, 0x400
-; GFX11-NEXT:    v_dual_mov_b32 v30, v34 :: v_dual_mov_b32 v31, v35
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 42
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[46:47]
-; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT:    s_clause 0xe
 ; GFX11-NEXT:    scratch_load_b32 v63, off, s33
 ; GFX11-NEXT:    scratch_load_b32 v62, off, s33 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v61, off, s33 offset:8
@@ -3524,14 +3501,13 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:48
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:52
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:56
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:60
-; GFX11-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v32, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v32, off, s33 offset:1536 ; 4-byte Folded Reload
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_addk_i32 s32, 0xf600
-; GFX11-NEXT:    s_mov_b32 s33, s45
+; GFX11-NEXT:    s_mov_b32 s33, s46
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 6f419ab2cc67eba..133b8ec6a34d071 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -208,26 +208,20 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr
 ; GCN-SCRATCH-NEXT:    v_lshlrev_b32_e32 v2, 4, v31
 ; GCN-SCRATCH-NEXT:    v_and_b32_e32 v18, 0x3ff0, v2
 ; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v0, v0, v18
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v6, 16, v0
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v10, 32, v0
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v14, 48, v0
 ; GCN-SCRATCH-NEXT:    s_clause 0x3
 ; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[2:5], v0, off
-; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[6:9], v6, off
-; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[10:13], v10, off
-; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[14:17], v14, off
+; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[6:9], v0, off offset:16
+; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[10:13], v0, off offset:32
+; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[14:17], v0, off offset:48
 ; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v0, v1, v18
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v1, 16, v0
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v18, 32, v0
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v19, 48, v0
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[2:5], off
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(2)
-; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v1, v[6:9], off
+; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[6:9], off offset:16
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
-; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v18, v[10:13], off
+; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[10:13], off offset:32
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v19, v[14:17], off
+; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[14:17], off offset:48
 ; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()

>From d676da0992942984814c1a45fb969309d4b5d2a6 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Tue, 31 Oct 2023 22:32:02 +0800
Subject: [PATCH 2/2] Address Review comments

Make isFlatScratchBaseLegal() accept two bases.
Change for GlobalIsel.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  34 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |   2 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  29 +-
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |   3 +-
 .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 202 +++--
 .../test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll |  48 +-
 .../CodeGen/AMDGPU/call-argument-types.ll     | 131 ++-
 .../CodeGen/AMDGPU/flat-scratch-i8-i16.ll     | 196 ++---
 llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll  | 198 ++---
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      | 102 +--
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |  87 +-
 .../AMDGPU/gfx-callable-argument-types.ll     | 104 ++-
 .../AMDGPU/gfx-callable-return-types.ll       | 743 +++++++-----------
 .../CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll | 534 ++++---------
 .../CodeGen/AMDGPU/schedule-addrspaces.ll     |   3 +-
 15 files changed, 923 insertions(+), 1493 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 691d644badd24b5..6bb6ebcbd511296 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1146,26 +1146,36 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
   return CurDAG->SignBitIsZero(Base);
 }
 
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr, SDValue Base,
+// This is used to check whether the address of scratch_load/store in the
+// form of `base1 + (base2) + immediate offset` is legal with respect to the
+// hardware's requirement that the SGPR/VGPR address offset in the flat scratch
+// instruction should be unsigned.
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr, SDValue Base1,
+                                                SDValue Base2,
                                                 uint64_t FlatVariant) const {
   if (FlatVariant != SIInstrFlags::FlatScratch)
     return true;
 
-  if (Addr.getOpcode() == ISD::ADD) {
+  assert(Base1.getNode());
+  if (Addr.getOpcode() == ISD::ADD && !Base2.getNode()) {
+    assert(isa<ConstantSDNode>(Addr->getOperand(1)));
     // For `nuw` addition, we should not have negative base address.
     if (Addr->getFlags().hasNoUnsignedWrap())
       return true;
 
-    auto *RHS = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
-    // If the immediate offset is negative, we should not have the base being
-    // negative as well.
-    if (RHS && RHS->getSExtValue() < 0)
+    // If the immediate offset is negative, the base address cannot also be
+    // negative.
+    if (CurDAG->SignBitIsZero(Addr.getOperand(1)))
       return true;
   }
 
   // When value in 32-bit Base can be negative calculate scratch offset using
   // 32-bit add instruction, otherwise use Base(unsigned) + offset.
-  return CurDAG->SignBitIsZero(Base);
+  if (!Base2.getNode())
+    return CurDAG->SignBitIsZero(Base1);
+
+  // For the case where we have two bases, check against both.
+  return CurDAG->SignBitIsZero(Base1) && CurDAG->SignBitIsZero(Base2);
 }
 
 // TODO: If offset is too big, put low 16-bit into offset.
@@ -1562,7 +1572,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
     SDValue N0, N1;
     if (isBaseWithConstantOffset64(Addr, N0, N1) &&
-        isFlatScratchBaseLegal(Addr, N0, FlatVariant)) {
+        isFlatScratchBaseLegal(Addr, N0, SDValue(), FlatVariant)) {
       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
       const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1795,7 +1805,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
   int64_t COffsetVal = 0;
 
   if (CurDAG->isBaseWithConstantOffset(Addr) &&
-      isFlatScratchBaseLegal(Addr, Addr.getOperand(0))) {
+      isFlatScratchBaseLegal(Addr, Addr.getOperand(0), SDValue())) {
     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
     SAddr = Addr.getOperand(0);
   } else {
@@ -1852,6 +1862,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
   int64_t ImmOffset = 0;
 
   SDValue LHS, RHS;
+  SDValue OldAddr = Addr;
   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
     const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1873,7 +1884,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
         VAddr = SDValue(VMov, 0);
         SAddr = LHS;
-        if (!isFlatScratchBaseLegal(Addr, SAddr))
+        if (!isFlatScratchBaseLegal(Addr, SAddr, SDValue()))
           return false;
         if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
           return false;
@@ -1899,8 +1910,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
     return false;
   }
 
-  if (!isFlatScratchBaseLegal(Addr, SAddr) ||
-      !isFlatScratchBaseLegal(Addr, VAddr))
+  if (!isFlatScratchBaseLegal(OldAddr, SAddr, VAddr))
     return false;
 
   if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 8a47757f70bbfbc..21dcf3de0d72e0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -155,7 +155,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
                         unsigned Size) const;
   bool isFlatScratchBaseLegal(
-      SDValue Addr, SDValue Base,
+      SDValue Addr, SDValue Base1, SDValue Base2,
       uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
 
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 31d72fb8cadd8a6..bb5ef870d519364 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4095,7 +4095,10 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
   int64_t ConstOffset;
   std::tie(PtrBase, ConstOffset) =
       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
-  if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant))
+
+  auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
+  if (ConstOffset == 0 ||
+      !isFlatScratchBaseLegal(*AddrDef->MI, PtrBase, FlatVariant))
     return Default;
 
   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
@@ -4257,15 +4260,16 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   // Match the immediate offset first, which canonically is moved as low as
   // possible.
   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+  auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
 
-  if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) &&
+  if (ConstOffset != 0 && isFlatScratchBaseLegal(*AddrDef->MI, PtrBase) &&
       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
                             SIInstrFlags::FlatScratch)) {
     Addr = PtrBase;
     ImmOffset = ConstOffset;
+    AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
   }
 
-  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
   if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
     int FI = AddrDef->MI->getOperand(1).getIndex();
     return {{
@@ -4352,7 +4356,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
   Register LHS = AddrDef->MI->getOperand(1).getReg();
   auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
 
-  if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS))
+  if (!KB->signBitIsZero(LHS) || !KB->signBitIsZero(RHS))
     return std::nullopt;
 
   if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
@@ -4486,11 +4490,26 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
   return KB->signBitIsZero(Base);
 }
 
+// This is used to check whether the address of scratch_load/store in the
+// form of `base1 + immediate offset` is legal with respect to the
+// hardware's requirement that the SGPR/VGPR address offset in the flat scratch
+// instruction should be unsigned.
 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
-    Register Base, uint64_t FlatVariant) const {
+    MachineInstr &AddrDef, Register Base, uint64_t FlatVariant) const {
   if (FlatVariant != SIInstrFlags::FlatScratch)
     return true;
 
+  if (AddrDef.getOpcode() == TargetOpcode::G_PTR_ADD) {
+    // For `nuw` addition, we should not have negative base address.
+    if (AddrDef.getFlag(MachineInstr::NoUWrap))
+      return true;
+    // FIXME: Add assertion this should be immediate.
+    // If the immediate offset is negative, the base address cannot also be
+    // negative.
+    if (KB->signBitIsZero(AddrDef.getOperand(1).getReg()))
+      return true;
+  }
+
   // When value in 32-bit Base can be negative calculate scratch offset using
   // 32-bit add instruction, otherwise use Base(unsigned) + offset.
   return KB->signBitIsZero(Base);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 93e45fcd8682f07..ef9839f31cbdc20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -244,7 +244,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
                         unsigned Size) const;
   bool isFlatScratchBaseLegal(
-      Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+      MachineInstr &AddrDef, Register Base,
+      uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
 
   std::pair<Register, unsigned>
   selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 45df3bc094f351e..ec2cd43e5fb5df3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -89,17 +89,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 4, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 4
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -111,42 +109,39 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7c
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT:    v_add3_u32 v1, 4, v1, v2
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_vindex_kernel:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 4
-; GFX940-NEXT:    scratch_store_dword v1, v3, off offset:4 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX940-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v1, v2, off offset:4 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
-; GFX11-NEXT:    v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX11-NEXT:    scratch_store_b32 v0, v3, off offset:4 dlc
+; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_add3_u32 v1, 4, v1, v2
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 4, v1
+; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 bb:
@@ -233,34 +228,31 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: private_ptr_foo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_ptr_foo:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: private_ptr_foo:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_ptr_foo:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
-; GFX11-NEXT:    scratch_store_b32 v0, v1, off
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
   store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -366,16 +358,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-NEXT:    scratch_load_dword v1, off, s0 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x104
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x104, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -387,16 +377,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7c
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
-; GFX10-NEXT:    v_add3_u32 v1, 0x104, v1, v2
-; GFX10-NEXT:    scratch_load_dword v2, off, off offset:4 glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -405,30 +394,27 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x104
-; GFX940-NEXT:    scratch_store_dword v1, v3, off offset:260 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX940-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v1, v2, off offset:260 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_add_u32_e32 v0, 0x104, v0
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
-; GFX11-NEXT:    v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX11-NEXT:    v_add3_u32 v1, 0x104, v1, v2
-; GFX11-NEXT:    scratch_load_b32 v2, off, off offset:4 glc dlc
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 v0, v3, off offset:260 dlc
+; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:260 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x104, v1
+; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 bb:
@@ -631,16 +617,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-NEXT:    scratch_load_dword v1, off, s0 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4004
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x4004, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -652,16 +636,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7c
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
-; GFX10-NEXT:    v_add3_u32 v1, 0x4004, v1, v2
-; GFX10-NEXT:    scratch_load_dword v2, off, off offset:4 glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -670,32 +653,29 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v3, 15
-; GFX940-NEXT:    s_movk_i32 s0, 0x4004
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x4004
-; GFX940-NEXT:    scratch_store_dword v1, v3, s0 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, 15
+; GFX940-NEXT:    s_movk_i32 s0, 0x4004
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX940-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v1, v2, s0 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_add_u32_e32 v0, 0x4004, v0
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
-; GFX11-NEXT:    v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_movk_i32 s0, 0x4004
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX11-NEXT:    v_add3_u32 v1, 0x4004, v1, v2
-; GFX11-NEXT:    scratch_load_b32 v2, off, off offset:4 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 v0, v3, s0 dlc
+; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX11-NEXT:    scratch_store_b32 v0, v2, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v1
+; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 bb:
@@ -945,16 +925,14 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
-; GFX9-NEXT:    v_mov_b32_e32 v1, 4
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x400
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GFX9-NEXT:    v_add3_u32 v0, v1, v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, 15
-; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -965,44 +943,40 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x400
+; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GFX10-NEXT:    v_add3_u32 v0, 4, v0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 15
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v0, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_vidx_sidx_offset:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX940-NEXT:    v_mov_b32_e32 v1, 4
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x400
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GFX940-NEXT:    v_add3_u32 v0, v1, v0, v2
-; GFX940-NEXT:    v_mov_b32_e32 v1, 15
-; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1024 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1024 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vidx_sidx_offset:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0x400
+; GFX11-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, 4, v0, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, 15
-; GFX11-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:1024 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1024 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 7beaf3103586375..00eb4f4d0db1c93 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -244,33 +244,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; DAGISEL-GFX11-LABEL: amdgpu_cs_chain_spill:
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; DAGISEL-GFX11-NEXT:    s_add_i32 s24, s32, 60
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v16, s32
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v31, s24
-; DAGISEL-GFX11-NEXT:    s_add_i32 s24, s32, 56
-; DAGISEL-GFX11-NEXT:    s_add_i32 s25, s32, 52
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v30, s24
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v29, s25
-; DAGISEL-GFX11-NEXT:    s_add_i32 s24, s32, 48
-; DAGISEL-GFX11-NEXT:    s_add_i32 s25, s32, 44
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v28, s24
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v27, s25
-; DAGISEL-GFX11-NEXT:    s_add_i32 s24, s32, 40
-; DAGISEL-GFX11-NEXT:    s_add_i32 s25, s32, 36
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v26, s24
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v25, s25
-; DAGISEL-GFX11-NEXT:    s_add_i32 s24, s32, 32
-; DAGISEL-GFX11-NEXT:    s_add_i32 s25, s32, 28
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v24, s24
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v23, s25
-; DAGISEL-GFX11-NEXT:    s_add_i32 s24, s32, 24
-; DAGISEL-GFX11-NEXT:    s_add_i32 s25, s32, 20
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v22, s24
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v21, s25
-; DAGISEL-GFX11-NEXT:    s_add_i32 s24, s32, 16
-; DAGISEL-GFX11-NEXT:    s_add_i32 s25, s32, 12
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v20, s24
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v19, s25
 ; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[24:25]
 ; DAGISEL-GFX11-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+4
 ; DAGISEL-GFX11-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+12
@@ -279,10 +252,23 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v36, v11 :: v_dual_mov_b32 v37, v10
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v38, v9 :: v_dual_mov_b32 v39, v8
-; DAGISEL-GFX11-NEXT:    s_add_i32 s26, s32, 8
-; DAGISEL-GFX11-NEXT:    s_add_i32 s27, s32, 4
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v18, s26
-; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v17, s27
+; DAGISEL-GFX11-NEXT:    s_clause 0xf
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v31, s32 offset:60
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v30, s32 offset:56
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v29, s32 offset:52
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v28, s32 offset:48
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v27, s32 offset:44
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v26, s32 offset:40
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v25, s32 offset:36
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v24, s32 offset:32
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v23, s32 offset:28
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v22, s32 offset:24
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v21, s32 offset:20
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v20, s32 offset:16
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v19, s32 offset:12
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v18, s32 offset:8
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v17, s32 offset:4
+; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v16, s32
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 4e2b83af7f5e20b..b47ef882a283d8f 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -4535,8 +4535,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v32i32_i32 at rel32@hi+12
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x8
-; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
 ; GFX11-NEXT:    buffer_load_b32 v32, off, s[4:7], 0
+; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
 ; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
 ; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32
@@ -4545,11 +4545,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80
 ; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_add_i32 s4, s32, 4
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    scratch_store_b32 off, v31, s32
+; GFX11-NEXT:    scratch_store_b32 off, v32, s32 offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    scratch_store_b32 off, v32, s4
+; GFX11-NEXT:    scratch_store_b32 off, v31, s32
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -5546,15 +5545,16 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; GFX11-NEXT:    s_load_b512 s[4:19], s[2:3], 0x64
 ; GFX11-NEXT:    s_load_b512 s[36:51], s[2:3], 0x24
 ; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s22, s32, 8
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, stack_passed_f64_arg at rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, stack_passed_f64_arg at rel32@hi+12
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s21 :: v_dual_mov_b32 v1, s20
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s19
-; GFX11-NEXT:    s_add_i32 s19, s32, 4
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s43
-; GFX11-NEXT:    scratch_store_b32 off, v0, s22
-; GFX11-NEXT:    scratch_store_b32 off, v1, s19
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v2, s32
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s39
 ; GFX11-NEXT:    v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v2, s38
@@ -5572,10 +5572,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
 ; GFX11-NEXT:    v_dual_mov_b32 v29, s17 :: v_dual_mov_b32 v28, s16
 ; GFX11-NEXT:    v_mov_b32_e32 v30, s18
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, stack_passed_f64_arg at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, stack_passed_f64_arg at rel32@hi+12
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -6029,14 +6025,14 @@ define void @stack_12xv3i32() #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, 11 :: v_dual_mov_b32 v1, 12
-; GFX11-NEXT:    v_dual_mov_b32 v2, 13 :: v_dual_mov_b32 v3, 14
-; GFX11-NEXT:    v_mov_b32_e32 v4, 15
+; GFX11-NEXT:    v_dual_mov_b32 v4, 15 :: v_dual_mov_b32 v1, 12
+; GFX11-NEXT:    v_dual_mov_b32 v0, 11 :: v_dual_mov_b32 v3, 14
+; GFX11-NEXT:    v_mov_b32_e32 v2, 13
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    s_add_i32 s0, s32, 16
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v4, s32 offset:16
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
-; GFX11-NEXT:    scratch_store_b32 off, v4, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v3, 1 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 1 :: v_dual_mov_b32 v4, 1
@@ -6368,25 +6364,25 @@ define void @stack_12xv3f32() #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0x41700000
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0x41300000
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41400000
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41500000
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0x41500000 :: v_dual_mov_b32 v5, 1.0
 ; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41600000
-; GFX11-NEXT:    v_dual_mov_b32 v4, 0x41700000 :: v_dual_mov_b32 v5, 1.0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    s_add_i32 s0, s32, 16
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v4, s32 offset:16
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
-; GFX11-NEXT:    scratch_store_b32 off, v4, s0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1.0
-; GFX11-NEXT:    v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 2.0
-; GFX11-NEXT:    v_dual_mov_b32 v6, 2.0 :: v_dual_mov_b32 v9, 0x40400000
-; GFX11-NEXT:    v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v11, 0x40400000
-; GFX11-NEXT:    v_dual_mov_b32 v10, 0x40400000 :: v_dual_mov_b32 v13, 4.0
-; GFX11-NEXT:    v_dual_mov_b32 v12, 4.0 :: v_dual_mov_b32 v15, 0x40a00000
-; GFX11-NEXT:    v_dual_mov_b32 v14, 4.0 :: v_dual_mov_b32 v17, 0x40a00000
-; GFX11-NEXT:    v_mov_b32_e32 v16, 0x40a00000
+; GFX11-NEXT:    v_dual_mov_b32 v3, 1.0 :: v_dual_mov_b32 v4, 1.0
+; GFX11-NEXT:    v_dual_mov_b32 v7, 2.0 :: v_dual_mov_b32 v6, 2.0
+; GFX11-NEXT:    v_dual_mov_b32 v9, 0x40400000 :: v_dual_mov_b32 v8, 2.0
+; GFX11-NEXT:    v_dual_mov_b32 v11, 0x40400000 :: v_dual_mov_b32 v10, 0x40400000
+; GFX11-NEXT:    v_dual_mov_b32 v13, 4.0 :: v_dual_mov_b32 v12, 4.0
+; GFX11-NEXT:    v_dual_mov_b32 v15, 0x40a00000 :: v_dual_mov_b32 v14, 4.0
+; GFX11-NEXT:    v_dual_mov_b32 v17, 0x40a00000 :: v_dual_mov_b32 v16, 0x40a00000
 ; GFX11-NEXT:    v_dual_mov_b32 v18, 0x40c00000 :: v_dual_mov_b32 v19, 0x40c00000
 ; GFX11-NEXT:    v_mov_b32_e32 v20, 0x40c00000
 ; GFX11-NEXT:    v_dual_mov_b32 v21, 0x40e00000 :: v_dual_mov_b32 v22, 0x40e00000
@@ -6735,34 +6731,32 @@ define void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, 8
-; GFX11-NEXT:    v_dual_mov_b32 v2, 9 :: v_dual_mov_b32 v3, 10
-; GFX11-NEXT:    v_dual_mov_b32 v8, 15 :: v_dual_mov_b32 v5, 12
+; GFX11-NEXT:    v_dual_mov_b32 v8, 15 :: v_dual_mov_b32 v1, 12
+; GFX11-NEXT:    v_dual_mov_b32 v0, 11 :: v_dual_mov_b32 v3, 14
+; GFX11-NEXT:    v_dual_mov_b32 v2, 13 :: v_dual_mov_b32 v5, 8
+; GFX11-NEXT:    v_dual_mov_b32 v4, 7 :: v_dual_mov_b32 v7, 10
+; GFX11-NEXT:    v_mov_b32_e32 v6, 9
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_dual_mov_b32 v4, 11 :: v_dual_mov_b32 v7, 14
-; GFX11-NEXT:    v_mov_b32_e32 v6, 13
-; GFX11-NEXT:    s_add_i32 s0, s32, 32
-; GFX11-NEXT:    s_add_i32 s1, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    scratch_store_b32 off, v8, s0
-; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s1
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, 1
-; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 1
-; GFX11-NEXT:    v_dual_mov_b32 v6, 1 :: v_dual_mov_b32 v9, 1
-; GFX11-NEXT:    v_dual_mov_b32 v8, 1 :: v_dual_mov_b32 v11, 2
-; GFX11-NEXT:    v_dual_mov_b32 v10, 2 :: v_dual_mov_b32 v13, 2
-; GFX11-NEXT:    v_dual_mov_b32 v12, 2 :: v_dual_mov_b32 v15, 3
-; GFX11-NEXT:    v_dual_mov_b32 v14, 2 :: v_dual_mov_b32 v17, 3
-; GFX11-NEXT:    v_dual_mov_b32 v16, 3 :: v_dual_mov_b32 v19, 3
-; GFX11-NEXT:    v_dual_mov_b32 v18, 3 :: v_dual_mov_b32 v21, 4
-; GFX11-NEXT:    v_dual_mov_b32 v20, 4 :: v_dual_mov_b32 v23, 4
-; GFX11-NEXT:    v_dual_mov_b32 v22, 4 :: v_dual_mov_b32 v25, 5
-; GFX11-NEXT:    v_dual_mov_b32 v24, 4 :: v_dual_mov_b32 v27, 5
-; GFX11-NEXT:    v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v29, 5
-; GFX11-NEXT:    v_mov_b32_e32 v28, 5
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_store_b32 off, v8, s32 offset:32
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s32
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v5, 1 :: v_dual_mov_b32 v4, 0
+; GFX11-NEXT:    v_dual_mov_b32 v7, 1 :: v_dual_mov_b32 v6, 1
+; GFX11-NEXT:    v_dual_mov_b32 v9, 1 :: v_dual_mov_b32 v8, 1
+; GFX11-NEXT:    v_dual_mov_b32 v11, 2 :: v_dual_mov_b32 v10, 2
+; GFX11-NEXT:    v_dual_mov_b32 v13, 2 :: v_dual_mov_b32 v12, 2
+; GFX11-NEXT:    v_dual_mov_b32 v15, 3 :: v_dual_mov_b32 v14, 2
+; GFX11-NEXT:    v_dual_mov_b32 v17, 3 :: v_dual_mov_b32 v16, 3
+; GFX11-NEXT:    v_dual_mov_b32 v19, 3 :: v_dual_mov_b32 v18, 3
+; GFX11-NEXT:    v_dual_mov_b32 v21, 4 :: v_dual_mov_b32 v20, 4
+; GFX11-NEXT:    v_dual_mov_b32 v23, 4 :: v_dual_mov_b32 v22, 4
+; GFX11-NEXT:    v_dual_mov_b32 v25, 5 :: v_dual_mov_b32 v24, 4
+; GFX11-NEXT:    v_dual_mov_b32 v27, 5 :: v_dual_mov_b32 v26, 5
+; GFX11-NEXT:    v_dual_mov_b32 v29, 5 :: v_dual_mov_b32 v28, 5
 ; GFX11-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
@@ -7107,22 +7101,21 @@ define void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40e00000
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41000000
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41100000
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41200000
 ; GFX11-NEXT:    v_mov_b32_e32 v8, 0x41700000
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41400000
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41500000
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41600000
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0x40e00000
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0x41000000
+; GFX11-NEXT:    v_dual_mov_b32 v6, 0x41100000 :: v_dual_mov_b32 v9, 1.0
+; GFX11-NEXT:    v_mov_b32_e32 v7, 0x41200000
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0x41300000
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0x41400000
-; GFX11-NEXT:    v_dual_mov_b32 v6, 0x41500000 :: v_dual_mov_b32 v9, 1.0
-; GFX11-NEXT:    v_mov_b32_e32 v7, 0x41600000
-; GFX11-NEXT:    s_add_i32 s0, s32, 32
-; GFX11-NEXT:    s_add_i32 s1, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
-; GFX11-NEXT:    scratch_store_b32 off, v8, s0
-; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s1
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_store_b32 off, v8, s32 offset:32
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s32
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 1.0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
index ad4d4a4a30fc6d0..4ee2c6e36153140 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -111,17 +111,16 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_v(ptr addrspace(5) %i
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    scratch_load_ubyte_d16 v3, v0, off
+; GFX10-NEXT:    scratch_load_ubyte_d16 v3, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
-; GFX11-NEXT:    scratch_load_d16_u8 v3, v0, off
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX11-NEXT:    scratch_load_d16_u8 v3, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -141,17 +140,16 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_v(ptr addrspace(5) %i
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    scratch_load_sbyte_d16 v3, v0, off
+; GFX10-NEXT:    scratch_load_sbyte_d16 v3, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
-; GFX11-NEXT:    scratch_load_d16_i8 v3, v0, off
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX11-NEXT:    scratch_load_d16_i8 v3, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -171,17 +169,16 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_v(ptr addrspace(5) %in, p
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    scratch_load_short_d16 v3, v0, off
+; GFX10-NEXT:    scratch_load_short_d16 v3, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 2, v0
-; GFX11-NEXT:    scratch_load_d16_b16 v3, v0, off
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX11-NEXT:    scratch_load_d16_b16 v3, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -201,17 +198,16 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %i
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    scratch_load_ubyte_d16_hi v3, v0, off
+; GFX10-NEXT:    scratch_load_ubyte_d16_hi v3, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
-; GFX11-NEXT:    scratch_load_d16_hi_u8 v3, v0, off
+; GFX11-NEXT:    v_mov_b32_e32 v3, -1
+; GFX11-NEXT:    scratch_load_d16_hi_u8 v3, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -231,17 +227,16 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %i
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    scratch_load_sbyte_d16_hi v3, v0, off
+; GFX10-NEXT:    scratch_load_sbyte_d16_hi v3, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
-; GFX11-NEXT:    scratch_load_d16_hi_i8 v3, v0, off
+; GFX11-NEXT:    v_mov_b32_e32 v3, -1
+; GFX11-NEXT:    scratch_load_d16_hi_i8 v3, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -261,17 +256,16 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, p
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    scratch_load_short_d16_hi v3, v0, off
+; GFX10-NEXT:    scratch_load_short_d16_hi v3, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_v:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 2, v0
-; GFX11-NEXT:    scratch_load_d16_hi_b16 v3, v0, off
+; GFX11-NEXT:    v_mov_b32_e32 v3, -1
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v3, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -291,17 +285,15 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_v(ptr %in, ptr addrspac
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_byte_d16_hi v1, v0, off
+; GFX10-NEXT:    scratch_store_byte_d16_hi v2, v0, off offset:4
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_v:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 4, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b8 v1, v0, off
+; GFX11-NEXT:    scratch_store_d16_hi_b8 v2, v0, off offset:4
 ; GFX11-NEXT:    s_endpgm
 bb:
   %load = load <4 x i8>, ptr %in
@@ -319,17 +311,15 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_v(ptr %in, ptr addrspa
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 2, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_short_d16_hi v1, v0, off
+; GFX10-NEXT:    scratch_store_short_d16_hi v2, v0, off offset:2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_v:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, 2, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v1, v0, off
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v2, v0, off offset:2
 ; GFX11-NEXT:    s_endpgm
 bb:
   %load = load <2 x i16>, ptr %in
@@ -452,8 +442,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX10-NEXT:    s_add_i32 s2, s2, 1
-; GFX10-NEXT:    scratch_load_ubyte_d16 v2, off, s2
+; GFX10-NEXT:    scratch_load_ubyte_d16 v2, off, s2 offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -461,8 +450,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX11-NEXT:    s_add_i32 s0, s0, 1
-; GFX11-NEXT:    scratch_load_d16_u8 v2, off, s0
+; GFX11-NEXT:    scratch_load_d16_u8 v2, off, s0 offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -483,8 +471,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX10-NEXT:    s_add_i32 s2, s2, 1
-; GFX10-NEXT:    scratch_load_sbyte_d16 v2, off, s2
+; GFX10-NEXT:    scratch_load_sbyte_d16 v2, off, s2 offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -492,8 +479,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) in
 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX11-NEXT:    s_add_i32 s0, s0, 1
-; GFX11-NEXT:    scratch_load_d16_i8 v2, off, s0
+; GFX11-NEXT:    scratch_load_d16_i8 v2, off, s0 offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -514,8 +500,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX10-NEXT:    s_add_i32 s2, s2, 2
-; GFX10-NEXT:    scratch_load_short_d16 v2, off, s2
+; GFX10-NEXT:    scratch_load_short_d16 v2, off, s2 offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -523,8 +508,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg
 ; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0xffff0000
-; GFX11-NEXT:    s_add_i32 s0, s0, 2
-; GFX11-NEXT:    scratch_load_d16_b16 v2, off, s0
+; GFX11-NEXT:    scratch_load_d16_b16 v2, off, s0 offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -545,8 +529,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, -1
-; GFX10-NEXT:    s_add_i32 s2, s2, 1
-; GFX10-NEXT:    scratch_load_ubyte_d16_hi v2, off, s2
+; GFX10-NEXT:    scratch_load_ubyte_d16_hi v2, off, s2 offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -554,8 +537,7 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, -1
-; GFX11-NEXT:    s_add_i32 s0, s0, 1
-; GFX11-NEXT:    scratch_load_d16_hi_u8 v2, off, s0
+; GFX11-NEXT:    scratch_load_d16_hi_u8 v2, off, s0 offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -576,8 +558,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, -1
-; GFX10-NEXT:    s_add_i32 s2, s2, 1
-; GFX10-NEXT:    scratch_load_sbyte_d16_hi v2, off, s2
+; GFX10-NEXT:    scratch_load_sbyte_d16_hi v2, off, s2 offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -585,8 +566,7 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) in
 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, -1
-; GFX11-NEXT:    s_add_i32 s0, s0, 1
-; GFX11-NEXT:    scratch_load_d16_hi_i8 v2, off, s0
+; GFX11-NEXT:    scratch_load_d16_hi_i8 v2, off, s0 offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -607,8 +587,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, -1
-; GFX10-NEXT:    s_add_i32 s2, s2, 2
-; GFX10-NEXT:    scratch_load_short_d16_hi v2, off, s2
+; GFX10-NEXT:    scratch_load_short_d16_hi v2, off, s2 offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
@@ -616,8 +595,7 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg
 ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_mov_b32_e32 v2, -1
-; GFX11-NEXT:    s_add_i32 s0, s0, 2
-; GFX11-NEXT:    scratch_load_d16_hi_b16 v2, off, s0
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v2, off, s0 offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -637,17 +615,15 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_s(ptr %in, ptr addrspac
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
-; GFX10-NEXT:    s_add_i32 s2, s2, 4
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_byte_d16_hi off, v0, s2
+; GFX10-NEXT:    scratch_store_byte_d16_hi off, v0, s2 offset:4
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX11-NEXT:    s_add_i32 s0, s0, 4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b8 off, v0, s0
+; GFX11-NEXT:    scratch_store_d16_hi_b8 off, v0, s0 offset:4
 ; GFX11-NEXT:    s_endpgm
 bb:
   %load = load <4 x i8>, ptr %in
@@ -665,17 +641,15 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_s(ptr %in, ptr addrspa
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
-; GFX10-NEXT:    s_add_i32 s2, s2, 2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_short_d16_hi off, v0, s2
+; GFX10-NEXT:    scratch_store_short_d16_hi off, v0, s2 offset:2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_s:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX11-NEXT:    s_add_i32 s0, s0, 2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s0
+; GFX11-NEXT:    scratch_store_d16_hi_b16 off, v0, s0 offset:2
 ; GFX11-NEXT:    s_endpgm
 bb:
   %load = load <2 x i16>, ptr %in
@@ -813,20 +787,18 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5)
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT:    scratch_load_ubyte_d16 v3, v0, off
+; GFX10-NEXT:    scratch_load_ubyte_d16 v3, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT:    scratch_load_d16_u8 v3, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX11-NEXT:    scratch_load_d16_u8 v3, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -848,20 +820,18 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5)
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT:    scratch_load_sbyte_d16 v3, v0, off
+; GFX10-NEXT:    scratch_load_sbyte_d16 v3, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT:    scratch_load_d16_i8 v3, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX11-NEXT:    scratch_load_d16_i8 v3, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -883,20 +853,18 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inre
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
-; GFX10-NEXT:    scratch_load_short_d16 v3, v0, off
+; GFX10-NEXT:    scratch_load_short_d16 v3, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_svs:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 2
-; GFX11-NEXT:    scratch_load_d16_b16 v3, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0xffff0000
+; GFX11-NEXT:    scratch_load_d16_b16 v3, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -918,20 +886,18 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5)
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT:    scratch_load_ubyte_d16_hi v3, v0, off
+; GFX10-NEXT:    scratch_load_ubyte_d16_hi v3, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT:    scratch_load_d16_hi_u8 v3, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v3, -1
+; GFX11-NEXT:    scratch_load_d16_hi_u8 v3, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -953,20 +919,18 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5)
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT:    scratch_load_sbyte_d16_hi v3, v0, off
+; GFX10-NEXT:    scratch_load_sbyte_d16_hi v3, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT:    scratch_load_d16_hi_i8 v3, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v3, -1
+; GFX11-NEXT:    scratch_load_d16_hi_i8 v3, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -988,20 +952,18 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inre
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, -1
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
-; GFX10-NEXT:    scratch_load_short_d16_hi v3, v0, off
+; GFX10-NEXT:    scratch_load_short_d16_hi v3, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v3
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_svs:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 2
-; GFX11-NEXT:    scratch_load_d16_hi_b16 v3, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    v_mov_b32_e32 v3, -1
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v3, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX11-NEXT:    s_endpgm
@@ -1023,20 +985,17 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrsp
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
-; GFX10-NEXT:    v_add3_u32 v1, s2, v1, 4
+; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_byte_d16_hi v1, v0, off
+; GFX10-NEXT:    scratch_store_byte_d16_hi v1, v0, off offset:4
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_svs:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, s0, v1, 4
+; GFX11-NEXT:    v_lshl_add_u32 v1, v2, 2, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b8 v1, v0, off
+; GFX11-NEXT:    scratch_store_d16_hi_b8 v1, v0, off offset:4
 ; GFX11-NEXT:    s_endpgm
 bb:
   %load = load <4 x i8>, ptr %in
@@ -1056,20 +1015,17 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrs
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    flat_load_dword v0, v[0:1]
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
-; GFX10-NEXT:    v_add3_u32 v1, s2, v1, 2
+; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, s2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    scratch_store_short_d16_hi v1, v0, off
+; GFX10-NEXT:    scratch_store_short_d16_hi v1, v0, off offset:2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_svs:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v1, s0, v1, 2
+; GFX11-NEXT:    v_lshl_add_u32 v1, v2, 2, s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_store_d16_hi_b16 v1, v0, off
+; GFX11-NEXT:    scratch_store_d16_hi_b16 v1, v0, off offset:2
 ; GFX11-NEXT:    s_endpgm
 bb:
   %load = load <2 x i16>, ptr %in
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 2e01c0064ef7c1c..10105952fc45238 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -15,19 +15,16 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, v1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v3, 2, v0
-; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v2, off offset:1 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v3, off offset:2 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
-; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off offset:4 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    s_endpgm
 ;
@@ -55,17 +52,14 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -108,19 +102,16 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 1, v1
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v3, 2, v0
-; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v2, off offset:1 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v3, off offset:2 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
-; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off offset:4 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    s_endpgm
 ;
@@ -151,16 +142,13 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -204,19 +192,16 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v3, 2, v0
-; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v2, off offset:1 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v3, off offset:2 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
-; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off offset:4 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    s_endpgm
 ;
@@ -247,16 +232,13 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -300,20 +282,17 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, v1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v3, 2, v0
-; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v2, off offset:1 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v3, off offset:2 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
-; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off offset:4 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    s_endpgm
 ;
@@ -342,18 +321,16 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -397,20 +374,18 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 1, v1
 ; GFX940-SDAG-NEXT:    v_or_b32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v3, 2, v0
 ; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v3, off offset:2 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
-; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off offset:4 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    s_endpgm
 ;
@@ -440,20 +415,18 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 2 :: v_dual_mov_b32 v4, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 1, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v2, v1, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v4, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -499,20 +472,18 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX940-SDAG-NEXT:    v_or_b32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v3, 2, v0
 ; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v3, off offset:2 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
-; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off offset:4 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    s_endpgm
 ;
@@ -542,20 +513,18 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 2 :: v_dual_mov_b32 v4, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 1, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v2, v1, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v4, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -601,20 +570,17 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, v1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v3, 2, v0
-; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v2, off offset:1 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v3, off offset:2 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
-; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off offset:4 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    s_endpgm
 ;
@@ -643,18 +609,16 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -698,20 +662,18 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 1, v1
 ; GFX940-SDAG-NEXT:    v_or_b32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v3, 2, v0
 ; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v3, off offset:2 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
-; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off offset:4 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    s_endpgm
 ;
@@ -741,20 +703,18 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v3, 2 :: v_dual_mov_b32 v4, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v2, 1, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v2, v1, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v4, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -811,9 +771,8 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
 ; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
-; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v1, off offset:4 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    s_endpgm
 ;
@@ -843,20 +802,19 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v5, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v3, 1, v0
+; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 2, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v3, v1, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v2, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v5, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index fe984ffa653a5ef..cd41c5b0403d9e6 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -3603,20 +3603,21 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX10-LABEL: store_load_i32_negative_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, -1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
+; GFX10-NEXT:    scratch_store_byte v0, v1, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i32_negative_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 1
-; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
+; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
+; GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3642,34 +3643,24 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
-; GFX1010-PAL:       ; %bb.0: ; %bb
-; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
-; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
-; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
-; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
-; GFX1030-PAL:       ; %bb.0: ; %bb
-; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
-; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
-; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX10-PAL:       ; %bb.0: ; %bb
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX10-PAL-NEXT:    scratch_store_byte v0, v1, off
+; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_i32_negative_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
+; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
+; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -3694,21 +3685,21 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX10-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xffffef7f, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
+; GFX10-NEXT:    scratch_store_byte v0, v1, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
-; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
+; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0
+; GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3726,43 +3717,32 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX940-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    s_movk_i32 s0, 0xef7f
+; GFX940-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 1
-; GFX940-NEXT:    scratch_store_byte v0, v1, s0 sc0 sc1
+; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_ubyte v0, v0, s0 sc0 sc1
+; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
-; GFX1010-PAL:       ; %bb.0: ; %bb
-; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
-; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
-; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
-; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
-; GFX1030-PAL:       ; %bb.0: ; %bb
-; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
-; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
-; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
-; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX10-PAL:       ; %bb.0: ; %bb
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffef7f, v0
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX10-PAL-NEXT:    scratch_store_byte v0, v1, off
+; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
-; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0
+; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
+; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index e7d86c0c178e970..63164bc20dddd11 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -1561,34 +1561,28 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
 ; GFX11-NEXT:    buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
 ; GFX11-NEXT:    buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
-; GFX11-NEXT:    s_add_i32 s3, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s4, s0, 48
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:112
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:96
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:80
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:64
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT:    scratch_store_b128 v0, v[17:20], off offset:48
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:32
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT:    scratch_store_b128 v0, v[25:28], off offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT:    scratch_store_b128 v0, v[29:32], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT:    scratch_store_b32 v0, v33, off offset:128
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <33 x i32>, ptr addrspace(1) %ptr
@@ -1850,34 +1844,28 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
 ; GFX11-NEXT:    buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
 ; GFX11-NEXT:    buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
-; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0
-; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0
 ; GFX11-NEXT:    buffer_load_b32 v33, off, s[0:3], 0 offset:128
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
-; GFX11-NEXT:    s_add_i32 s3, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s4, s0, 48
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:112
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:96
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:80
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s0 offset:64
+; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:64
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s4
+; GFX11-NEXT:    scratch_store_b128 v0, v[17:20], off offset:48
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s0 offset:16
+; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:32
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s0
+; GFX11-NEXT:    scratch_store_b128 v0, v[25:28], off offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0 offset:32
+; GFX11-NEXT:    scratch_store_b128 v0, v[29:32], off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 off, v33, s0 offset:128
+; GFX11-NEXT:    scratch_store_b32 v0, v33, off offset:128
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
@@ -2143,33 +2131,24 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144
 ; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128
 ; GFX11-NEXT:    buffer_load_b32 v33, off, s[0:3], 0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT:    s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT:    s_add_i32 s4, s0, 0xc0
-; GFX11-NEXT:    s_add_i32 s5, s0, 0xb0
-; GFX11-NEXT:    s_add_i32 s6, s0, 0xa0
-; GFX11-NEXT:    s_add_i32 s7, s0, 0x90
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:240
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:224
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:208
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:192
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT:    scratch_store_b128 v0, v[17:20], off offset:176
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:160
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT:    scratch_store_b128 v0, v[25:28], off offset:144
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0 offset:128
+; GFX11-NEXT:    scratch_store_b128 v0, v[29:32], off offset:128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 off, v33, s0
+; GFX11-NEXT:    scratch_store_b32 v0, v33, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index f827a78125b7785..8b1f49ad6dd768f 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -14954,8 +14954,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 28
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s2, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -14994,7 +14992,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX11-NEXT:    s_mov_b32 s24, s40
 ; GFX11-NEXT:    s_mov_b32 s25, s41
-; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s2
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s32 offset:16
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX11-NEXT:    s_mov_b32 s26, s42
@@ -15057,7 +15056,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 28
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -15099,7 +15097,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, s39
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, s40
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s2
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s26, s42
@@ -15390,8 +15388,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 28
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s3, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
@@ -15425,19 +15421,19 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49
 ; GFX11-NEXT:    v_writelane_b32 v40, s24, 20
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s48
-; GFX11-NEXT:    s_add_i32 s2, s32, 24
 ; GFX11-NEXT:    s_mov_b32 s20, s36
 ; GFX11-NEXT:    s_mov_b32 s21, s37
-; GFX11-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX11-NEXT:    s_mov_b32 s22, s38
+; GFX11-NEXT:    v_writelane_b32 v40, s25, 21
 ; GFX11-NEXT:    s_mov_b32 s23, s39
 ; GFX11-NEXT:    s_mov_b32 s24, s40
 ; GFX11-NEXT:    s_mov_b32 s25, s41
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_store_b32 off, v6, s32 offset:24
+; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s32 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX11-NEXT:    s_mov_b32 s26, s42
-; GFX11-NEXT:    scratch_store_b32 off, v6, s2
-; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s3
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
 ; GFX11-NEXT:    v_writelane_b32 v40, s27, 23
 ; GFX11-NEXT:    s_mov_b32 s27, s43
 ; GFX11-NEXT:    v_writelane_b32 v40, s28, 24
@@ -15497,7 +15493,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 28
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    s_add_i32 s3, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
@@ -15528,7 +15523,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s22, 18
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, s2
-; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 24
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s23, 19
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
@@ -15544,8 +15538,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, s39
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, s40
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v6, s2
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s3
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v6, s32 offset:24
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
 ; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s26, 22
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s26, s42
@@ -16156,15 +16150,15 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 9
-; GFX11-NEXT:    v_dual_mov_b32 v2, 10 :: v_dual_mov_b32 v3, 11
-; GFX11-NEXT:    v_dual_mov_b32 v4, 12 :: v_dual_mov_b32 v5, 13
-; GFX11-NEXT:    v_dual_mov_b32 v6, 14 :: v_dual_mov_b32 v7, 15
+; GFX11-NEXT:    v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13
+; GFX11-NEXT:    v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15
+; GFX11-NEXT:    v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9
+; GFX11-NEXT:    v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    s_add_i32 s0, s32, 16
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
-; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s32
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1
@@ -16207,19 +16201,18 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 8
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 9
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 10
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 11
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 12
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 13
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 14
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 15
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 12
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 13
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 14
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 8
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 9
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 10
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 11
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    s_add_i32 s0, s32, 16
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s0
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s32
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
@@ -16440,19 +16433,19 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x41000000
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41100000
-; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41200000
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41300000
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0x41400000
-; GFX11-NEXT:    v_mov_b32_e32 v5, 0x41500000
-; GFX11-NEXT:    v_mov_b32_e32 v6, 0x41600000
-; GFX11-NEXT:    v_mov_b32_e32 v7, 0x41700000
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41500000
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41600000
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41700000
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0x41000000
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0x41100000
+; GFX11-NEXT:    v_mov_b32_e32 v6, 0x41200000
+; GFX11-NEXT:    v_mov_b32_e32 v7, 0x41300000
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    s_add_i32 s0, s32, 16
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
-; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s32
 ; GFX11-NEXT:    v_mov_b32_e32 v6, 1.0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
@@ -16497,19 +16490,18 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x41000000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x41100000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0x41200000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0x41300000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0x41400000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 0x41500000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 0x41600000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 0x41700000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x41500000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0x41600000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0x41700000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0x41000000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 0x41100000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 0x41200000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 0x41300000
 ; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    s_add_i32 s0, s32, 16
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
-; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s0
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s32
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 23502d1b36d1828..1f02ac63438b1aa 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -1989,256 +1989,137 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_clause 0x7
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:1024
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:512
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:256
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:128
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:64
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:32
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:16
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x7f0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x7e0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x7d0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x7c0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x7b0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x7a0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x790
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x780
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x770
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x760
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x750
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x740
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x730
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x720
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x710
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x700
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x6f0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x6e0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x6d0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x6c0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x6b0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x6a0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x690
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x680
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x670
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x660
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x650
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x640
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x630
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x620
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x610
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x600
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x5f0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x5e0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x5d0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x5c0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x5b0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x5a0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x590
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x580
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x570
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x560
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x550
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x540
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x530
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x520
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x510
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x500
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x4f0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x4e0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x4d0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x4c0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x4b0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x4a0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x490
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x480
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x470
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x460
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x450
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x440
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x430
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x420
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x410
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x3f0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x3e0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x3d0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x3c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x3b0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x3a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x390
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x380
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x370
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x360
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x350
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x340
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x330
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x320
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x310
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x300
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x2f0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x2e0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x2d0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x2c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x2b0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x2a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x290
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x280
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x270
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x260
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x250
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x240
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x230
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x220
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x210
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x1f0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x1e0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x1d0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x1c0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x1b0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x1a0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x190
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x180
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x170
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x160
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x150
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x140
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x130
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x120
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x110
-; GFX11-NEXT:    s_add_i32 s2, s0, 0xf0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0xe0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0xd0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0xc0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0xb0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0xa0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x90
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s0, s0, 48
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
+; GFX11-NEXT:    s_clause 0x3e
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:2032
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:2016
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:2000
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1984
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1968
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1952
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1936
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1920
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1904
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1888
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1872
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1856
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1840
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1824
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1808
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1792
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1776
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1760
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1744
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1728
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1712
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1696
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1680
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1664
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1648
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1632
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1616
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1600
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1584
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1568
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1552
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1536
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1520
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1504
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1488
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1472
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1456
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1440
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1424
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1408
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1392
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1376
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1360
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1344
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1328
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1312
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1296
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1280
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1264
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1248
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1232
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1216
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1200
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1184
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1168
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1152
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1136
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1120
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1104
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1088
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1072
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1056
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1040
+; GFX11-NEXT:    s_clause 0x3e
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1024
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:1008
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:992
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:976
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:960
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:944
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:928
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:912
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:896
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:880
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:864
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:848
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:832
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:816
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:800
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:784
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:768
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:752
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:736
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:720
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:704
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:688
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:672
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:656
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:640
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:624
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:608
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:592
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:576
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:560
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:544
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:528
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:512
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:496
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:480
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:464
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:448
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:432
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:416
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:400
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:384
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:368
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:352
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:336
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:320
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:304
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:288
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:272
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:256
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:240
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:224
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:208
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:192
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:176
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:160
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:144
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:128
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:112
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:96
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:80
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:64
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:48
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off offset:16
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   ret <512 x i32> zeroinitializer
@@ -2636,7 +2517,6 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX11-LABEL: return_72xi32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    s_clause 0xc
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:212
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:208
@@ -2651,93 +2531,82 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:172
 ; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:168
 ; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:164
-; GFX11-NEXT:    s_clause 0x14
-; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:120
-; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s0 offset:64
+; GFX11-NEXT:    s_clause 0x11
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:88
+; GFX11-NEXT:    scratch_store_b128 v0, v[21:24], off offset:80
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_load_b32 v23, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_b32 v22, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_b32 v21, off, s32 offset:104
+; GFX11-NEXT:    scratch_store_b128 v0, v[17:20], off offset:64
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_load_b32 v20, off, s32 offset:144
-; GFX11-NEXT:    scratch_load_b32 v19, off, s32 offset:140
-; GFX11-NEXT:    scratch_load_b32 v18, off, s32 offset:136
-; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s0 offset:32
+; GFX11-NEXT:    scratch_load_b32 v19, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_b32 v18, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_b32 v17, off, s32 offset:120
+; GFX11-NEXT:    scratch_store_b128 v0, v[13:16], off offset:48
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_load_b32 v12, off, s32 offset:160
-; GFX11-NEXT:    scratch_load_b32 v11, off, s32 offset:156
-; GFX11-NEXT:    scratch_load_b32 v10, off, s32 offset:152
-; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s0 offset:16
+; GFX11-NEXT:    scratch_load_b32 v15, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_b32 v14, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_b32 v13, off, s32 offset:136
+; GFX11-NEXT:    scratch_store_b128 v0, v[9:12], off offset:32
 ; GFX11-NEXT:    s_clause 0xd
-; GFX11-NEXT:    scratch_load_b32 v8, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v7, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v6, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v5, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b32 v9, off, s32 offset:148
-; GFX11-NEXT:    scratch_load_b32 v17, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:152
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:148
+; GFX11-NEXT:    scratch_load_b32 v12, off, s32 offset:132
+; GFX11-NEXT:    scratch_load_b32 v16, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v20, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v32, off, s32
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x110
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x100
-; GFX11-NEXT:    s_add_i32 s3, s0, 0xf0
-; GFX11-NEXT:    s_add_i32 s34, s0, 0xe0
-; GFX11-NEXT:    s_add_i32 s35, s0, 0xd0
-; GFX11-NEXT:    s_add_i32 s36, s0, 0xc0
-; GFX11-NEXT:    s_add_i32 s37, s0, 0xb0
-; GFX11-NEXT:    s_add_i32 s38, s0, 0xa0
-; GFX11-NEXT:    s_add_i32 s39, s0, 0x90
-; GFX11-NEXT:    s_add_i32 s40, s0, 0x70
-; GFX11-NEXT:    s_add_i32 s41, s0, 0x60
-; GFX11-NEXT:    s_add_i32 s42, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s43, s0, 48
 ; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s0 offset:128
+; GFX11-NEXT:    scratch_store_b128 v0, v[60:63], off offset:272
 ; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s1
+; GFX11-NEXT:    scratch_store_b128 v0, v[12:15], off offset:256
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s2
+; GFX11-NEXT:    scratch_store_b128 v0, v[16:19], off offset:240
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    scratch_store_b128 off, v[60:63], s3
+; GFX11-NEXT:    scratch_store_b128 v0, v[20:23], off offset:224
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    scratch_store_b128 off, v[56:59], s34
+; GFX11-NEXT:    scratch_store_b128 v0, v[56:59], off offset:208
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s35
+; GFX11-NEXT:    scratch_store_b128 v0, v[41:44], off offset:192
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    scratch_store_b128 off, v[37:40], s36
+; GFX11-NEXT:    scratch_store_b128 v0, v[37:40], off offset:176
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s37
+; GFX11-NEXT:    scratch_store_b128 v0, v[52:55], off offset:160
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s38
+; GFX11-NEXT:    scratch_store_b128 v0, v[48:51], off offset:144
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    scratch_store_b128 off, v[33:36], s39
+; GFX11-NEXT:    scratch_store_b128 v0, v[33:36], off offset:128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s40
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s41
-; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s42
-; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s43
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-NEXT:    scratch_store_b128 v0, v[25:28], off offset:96
+; GFX11-NEXT:    scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off
 ; GFX11-NEXT:    s_clause 0xc
 ; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:164
 ; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:168
@@ -3308,7 +3177,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-LABEL: call_72xi32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s46, s33
+; GFX11-NEXT:    s_mov_b32 s34, s33
 ; GFX11-NEXT:    s_add_i32 s33, s32, 0x1ff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s33, s33, 0xfffffe00
@@ -3316,13 +3185,14 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    s_addk_i32 s32, 0xa00
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT:    s_addk_i32 s32, 0xa00
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT:    s_add_i32 s2, s33, 0x200
 ; GFX11-NEXT:    s_clause 0xe
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:56
 ; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:52
@@ -3339,31 +3209,21 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_store_b32 off, v61, s33 offset:8
 ; GFX11-NEXT:    scratch_store_b32 off, v62, s33 offset:4
 ; GFX11-NEXT:    scratch_store_b32 off, v63, s33
-; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x90
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
-; GFX11-NEXT:    scratch_store_b32 off, v4, s0
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT:    s_add_i32 s0, s32, 0x80
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x70
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT:    s_add_i32 s0, s32, 0x60
-; GFX11-NEXT:    s_add_i32 s1, s32, 0x50
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT:    s_add_i32 s0, s32, 64
-; GFX11-NEXT:    s_add_i32 s1, s32, 48
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT:    s_add_i32 s0, s32, 32
-; GFX11-NEXT:    s_add_i32 s1, s32, 16
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT:    s_add_i32 s0, s33, 0x200
 ; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    s_clause 0xa
+; GFX11-NEXT:    scratch_store_b32 off, v4, s32 offset:160
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:144
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:128
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:112
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:96
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:80
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:64
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, 0
@@ -3378,113 +3238,98 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
-; GFX11-NEXT:    s_mov_b32 s45, return_72xi32 at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s44, return_72xi32 at abs32@lo
+; GFX11-NEXT:    s_mov_b32 s1, return_72xi32 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s0, return_72xi32 at abs32@lo
 ; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b128 v[45:48], off, s33 offset:624
-; GFX11-NEXT:    scratch_load_b128 v[33:36], off, s33 offset:640
-; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
+; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:640
+; GFX11-NEXT:    scratch_load_b128 v[4:7], off, s33 offset:656
+; GFX11-NEXT:    s_add_i32 s2, s33, 0x400
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_mov_b32_e32 v32, v48
-; GFX11-NEXT:    s_clause 0x9
-; GFX11-NEXT:    scratch_load_b128 v[48:51], off, s33 offset:656
-; GFX11-NEXT:    scratch_load_b128 v[52:55], off, s33 offset:672
-; GFX11-NEXT:    scratch_load_b128 v[41:44], off, s33 offset:688
-; GFX11-NEXT:    scratch_load_b128 v[56:59], off, s33 offset:704
-; GFX11-NEXT:    scratch_load_b128 v[60:63], off, s33 offset:720
-; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:736
-; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:752
-; GFX11-NEXT:    scratch_load_b128 v[4:7], off, s33 offset:768
-; GFX11-NEXT:    scratch_load_b128 v[8:11], off, s33 offset:784
-; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:512
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_dual_mov_b32 v38, v53 :: v_dual_mov_b32 v37, v52
-; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_dual_mov_b32 v39, v54 :: v_dual_mov_b32 v52, v44
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_dual_mov_b32 v53, v56 :: v_dual_mov_b32 v54, v57
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v12
+; GFX11-NEXT:    v_dual_mov_b32 v33, v0 :: v_dual_mov_b32 v34, v1
+; GFX11-NEXT:    v_dual_mov_b32 v35, v2 :: v_dual_mov_b32 v36, v3
+; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:672
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_dual_mov_b32 v37, v4 :: v_dual_mov_b32 v38, v5
+; GFX11-NEXT:    v_dual_mov_b32 v39, v6 :: v_dual_mov_b32 v48, v7
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[16:19], s33 offset:1588 ; 16-byte Folded Spill
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:528
-; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:544
-; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:560
-; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:576
-; GFX11-NEXT:    v_mov_b32_e32 v56, v63
-; GFX11-NEXT:    v_mov_b32_e32 v12, v15
-; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v15, v2
-; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
-; GFX11-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v8, v19
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_mov_b32_e32 v10, v21
+; GFX11-NEXT:    v_dual_mov_b32 v49, v0 :: v_dual_mov_b32 v50, v1
+; GFX11-NEXT:    v_dual_mov_b32 v51, v2 :: v_dual_mov_b32 v52, v3
+; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:688
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:592
+; GFX11-NEXT:    v_dual_mov_b32 v53, v0 :: v_dual_mov_b32 v54, v1
+; GFX11-NEXT:    v_mov_b32_e32 v55, v2
+; GFX11-NEXT:    v_mov_b32_e32 v41, v3
+; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:704
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:608
+; GFX11-NEXT:    v_mov_b32_e32 v44, v2
+; GFX11-NEXT:    v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, v1
+; GFX11-NEXT:    v_mov_b32_e32 v56, v3
+; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:720
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s32
-; GFX11-NEXT:    v_mov_b32_e32 v32, v36
-; GFX11-NEXT:    v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49
-; GFX11-NEXT:    v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v36, v51
-; GFX11-NEXT:    v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v41
-; GFX11-NEXT:    v_mov_b32_e32 v50, v42
-; GFX11-NEXT:    v_dual_mov_b32 v55, v58 :: v_dual_mov_b32 v58, v13
-; GFX11-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v0, v3
-; GFX11-NEXT:    v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9
-; GFX11-NEXT:    scratch_store_b32 off, v11, s0
-; GFX11-NEXT:    s_add_i32 s0, s32, 0x90
-; GFX11-NEXT:    v_mov_b32_e32 v51, v43
-; GFX11-NEXT:    v_mov_b32_e32 v41, v59
-; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
-; GFX11-NEXT:    v_mov_b32_e32 v7, v18
-; GFX11-NEXT:    s_add_i32 s0, s32, 0x80
-; GFX11-NEXT:    v_dual_mov_b32 v42, v60 :: v_dual_mov_b32 v43, v61
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v9, v20
-; GFX11-NEXT:    s_add_i32 s0, s32, 0x70
-; GFX11-NEXT:    v_mov_b32_e32 v5, v16
-; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s0
-; GFX11-NEXT:    s_add_i32 s0, s32, 0x6c
-; GFX11-NEXT:    v_dual_mov_b32 v6, v17 :: v_dual_mov_b32 v11, v22
-; GFX11-NEXT:    scratch_store_b32 off, v0, s0
-; GFX11-NEXT:    s_add_i32 s0, s32, 0x60
-; GFX11-NEXT:    v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45
-; GFX11-NEXT:    scratch_store_b96 off, v[56:58], s0
-; GFX11-NEXT:    s_add_i32 s0, s32, 0x50
-; GFX11-NEXT:    v_mov_b32_e32 v13, v24
-; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s0
-; GFX11-NEXT:    s_add_i32 s0, s32, 64
-; GFX11-NEXT:    v_dual_mov_b32 v14, v25 :: v_dual_mov_b32 v31, v47
-; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s0
-; GFX11-NEXT:    s_add_i32 s0, s32, 48
-; GFX11-NEXT:    v_mov_b32_e32 v15, v26
-; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s0
-; GFX11-NEXT:    s_add_i32 s0, s32, 32
-; GFX11-NEXT:    v_mov_b32_e32 v16, v27
-; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s0
-; GFX11-NEXT:    s_add_i32 s0, s32, 16
-; GFX11-NEXT:    v_mov_b32_e32 v30, v46
-; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s0
-; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload
+; GFX11-NEXT:    v_dual_mov_b32 v57, v0 :: v_dual_mov_b32 v58, v1
+; GFX11-NEXT:    v_mov_b32_e32 v59, v2
+; GFX11-NEXT:    v_mov_b32_e32 v45, v3
+; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:736
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v1, 42
+; GFX11-NEXT:    v_dual_mov_b32 v46, v0 :: v_dual_mov_b32 v47, v1
+; GFX11-NEXT:    v_mov_b32_e32 v60, v3
+; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:752
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v62, v1 :: v_dual_mov_b32 v63, v2
+; GFX11-NEXT:    v_mov_b32_e32 v5, v3
+; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:768
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v61, v0 :: v_dual_mov_b32 v6, v1
+; GFX11-NEXT:    v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3
+; GFX11-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX11-NEXT:    s_clause 0x2
-; GFX11-NEXT:    scratch_load_b128 v[17:20], off, s33 offset:1572
+; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:784
+; GFX11-NEXT:    scratch_load_b128 v[29:32], off, s33 offset:624
+; GFX11-NEXT:    scratch_load_b128 v[9:12], off, s33 offset:512
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s33 offset:1572 ; 16-byte Folded Spill
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    scratch_load_b128 v[9:12], off, s33 offset:528
+; GFX11-NEXT:    scratch_load_b128 v[13:16], off, s33 offset:544
+; GFX11-NEXT:    scratch_load_b128 v[17:20], off, s33 offset:560
+; GFX11-NEXT:    scratch_load_b128 v[21:24], off, s33 offset:576
+; GFX11-NEXT:    scratch_load_b128 v[25:28], off, s33 offset:592
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s33 offset:1556 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_load_b128 v[25:28], off, s33 offset:608
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s33 offset:1540 ; 16-byte Folded Spill
+; GFX11-NEXT:    s_clause 0xa
+; GFX11-NEXT:    scratch_store_b32 off, v4, s32 offset:160
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:144
+; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s32 offset:128
+; GFX11-NEXT:    scratch_store_b128 off, v[60:63], s32 offset:112
+; GFX11-NEXT:    scratch_store_b128 off, v[56:59], s32 offset:80
+; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s32 offset:64
+; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s32 offset:48
+; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s32 offset:32
+; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s32 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s32
+; GFX11-NEXT:    scratch_store_b96 off, v[45:47], s32 offset:96
+; GFX11-NEXT:    v_mov_b32_e32 v0, 24
+; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1572 ; 16-byte Folded Reload
+; GFX11-NEXT:    v_dual_mov_b32 v5, v9 :: v_dual_mov_b32 v6, v10
+; GFX11-NEXT:    v_dual_mov_b32 v7, v11 :: v_dual_mov_b32 v8, v12
+; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:108
+; GFX11-NEXT:    v_dual_mov_b32 v9, v13 :: v_dual_mov_b32 v10, v14
+; GFX11-NEXT:    v_dual_mov_b32 v11, v15 :: v_dual_mov_b32 v12, v16
+; GFX11-NEXT:    v_dual_mov_b32 v13, v17 :: v_dual_mov_b32 v14, v18
+; GFX11-NEXT:    v_dual_mov_b32 v15, v19 :: v_dual_mov_b32 v16, v20
+; GFX11-NEXT:    v_dual_mov_b32 v17, v21 :: v_dual_mov_b32 v18, v22
+; GFX11-NEXT:    v_dual_mov_b32 v19, v23 :: v_dual_mov_b32 v20, v24
+; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    scratch_load_b128 v[21:24], off, s33 offset:1556
 ; GFX11-NEXT:    scratch_load_b128 v[25:28], off, s33 offset:1540
-; GFX11-NEXT:    s_add_i32 s0, s33, 0x400
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mov_b32_e32 v0, s0
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 42
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT:    s_clause 0xe
 ; GFX11-NEXT:    scratch_load_b32 v63, off, s33
 ; GFX11-NEXT:    scratch_load_b32 v62, off, s33 offset:4
@@ -3507,7 +3352,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_addk_i32 s32, 0xf600
-; GFX11-NEXT:    s_mov_b32 s33, s46
+; GFX11-NEXT:    s_mov_b32 s33, s34
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
index a6ca056211fff9c..ec765cc4fccf7ac 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
@@ -2032,404 +2032,142 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_many_regs(<36 x i32> inreg %a, <128
   ; DAGISEL-GFX11-NEXT:   [[COPY163:%[0-9]+]]:sgpr_32 = COPY $sgpr0
   ; DAGISEL-GFX11-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
   ; DAGISEL-GFX11-NEXT:   [[COPY164:%[0-9]+]]:sreg_32_xexec_hi = COPY $sgpr32
-  ; DAGISEL-GFX11-NEXT:   [[COPY165:%[0-9]+]]:vgpr_32 = COPY [[COPY131]]
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY165]], [[COPY164]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 524
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY]], killed [[S_ADD_I32_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 524, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 520
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_1:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_1]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY1]], killed [[S_ADD_I32_1]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 520, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 516
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_2:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_2]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY2]], killed [[S_ADD_I32_2]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 516, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 512
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_3:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_3]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY3]], killed [[S_ADD_I32_3]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 512, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 508
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_4:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_4]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY4]], killed [[S_ADD_I32_4]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 508, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 504
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_5:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_5]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY5]], killed [[S_ADD_I32_5]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 504, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 500
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_6:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_6]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY6]], killed [[S_ADD_I32_6]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 500, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 496
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_7:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_7]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY7]], killed [[S_ADD_I32_7]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 496, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 492
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_8:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_8]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY8]], killed [[S_ADD_I32_8]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 492, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 488
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_9:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_9]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY9]], killed [[S_ADD_I32_9]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 488, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 484
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_10:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_10]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY10]], killed [[S_ADD_I32_10]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 484, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 480
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_11:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_11]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY11]], killed [[S_ADD_I32_11]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 480, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 476
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_12:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_12]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY12]], killed [[S_ADD_I32_12]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 476, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 472
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_13:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_13]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY13]], killed [[S_ADD_I32_13]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 472, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 468
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_14:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_14]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY14]], killed [[S_ADD_I32_14]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 468, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 464
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_15:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_15]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY15]], killed [[S_ADD_I32_15]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 464, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 460
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_16:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_16]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY16]], killed [[S_ADD_I32_16]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 460, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 456
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_17:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_17]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY17]], killed [[S_ADD_I32_17]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 456, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 452
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_18:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_18]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY18]], killed [[S_ADD_I32_18]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 452, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 448
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_19:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_19]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY19]], killed [[S_ADD_I32_19]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 448, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 444
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_20:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_20]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY20]], killed [[S_ADD_I32_20]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 444, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 440
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_21:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_21]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY21]], killed [[S_ADD_I32_21]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 440, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 436
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_22:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_22]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY22]], killed [[S_ADD_I32_22]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 436, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_23:%[0-9]+]]:sreg_32 = S_MOV_B32 432
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_23:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_23]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY23]], killed [[S_ADD_I32_23]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 432, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_24:%[0-9]+]]:sreg_32 = S_MOV_B32 428
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_24:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_24]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY24]], killed [[S_ADD_I32_24]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 428, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_25:%[0-9]+]]:sreg_32 = S_MOV_B32 424
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_25:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_25]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY25]], killed [[S_ADD_I32_25]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 424, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_26:%[0-9]+]]:sreg_32 = S_MOV_B32 420
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_26:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_26]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY26]], killed [[S_ADD_I32_26]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 420, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_27:%[0-9]+]]:sreg_32 = S_MOV_B32 416
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_27:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_27]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY27]], killed [[S_ADD_I32_27]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 416, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_28:%[0-9]+]]:sreg_32 = S_MOV_B32 412
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_28:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_28]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY28]], killed [[S_ADD_I32_28]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 412, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_29:%[0-9]+]]:sreg_32 = S_MOV_B32 408
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_29:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_29]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY29]], killed [[S_ADD_I32_29]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 408, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_30:%[0-9]+]]:sreg_32 = S_MOV_B32 404
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_30:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_30]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY30]], killed [[S_ADD_I32_30]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 404, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_31:%[0-9]+]]:sreg_32 = S_MOV_B32 400
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_31:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_31]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY31]], killed [[S_ADD_I32_31]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 400, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_32:%[0-9]+]]:sreg_32 = S_MOV_B32 396
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_32:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_32]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY32]], killed [[S_ADD_I32_32]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 396, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_33:%[0-9]+]]:sreg_32 = S_MOV_B32 392
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_33:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_33]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY33]], killed [[S_ADD_I32_33]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 392, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_34:%[0-9]+]]:sreg_32 = S_MOV_B32 388
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_34:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_34]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY34]], killed [[S_ADD_I32_34]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 388, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_35:%[0-9]+]]:sreg_32 = S_MOV_B32 384
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_35:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_35]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY35]], killed [[S_ADD_I32_35]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 384, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_36:%[0-9]+]]:sreg_32 = S_MOV_B32 380
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_36:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_36]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY36]], killed [[S_ADD_I32_36]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 380, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_37:%[0-9]+]]:sreg_32 = S_MOV_B32 376
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_37:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_37]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY37]], killed [[S_ADD_I32_37]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 376, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_38:%[0-9]+]]:sreg_32 = S_MOV_B32 372
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_38:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_38]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY38]], killed [[S_ADD_I32_38]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 372, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_39:%[0-9]+]]:sreg_32 = S_MOV_B32 368
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_39:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_39]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY39]], killed [[S_ADD_I32_39]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 368, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_40:%[0-9]+]]:sreg_32 = S_MOV_B32 364
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_40:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_40]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY40]], killed [[S_ADD_I32_40]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 364, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_41:%[0-9]+]]:sreg_32 = S_MOV_B32 360
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_41:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_41]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY41]], killed [[S_ADD_I32_41]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 360, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_42:%[0-9]+]]:sreg_32 = S_MOV_B32 356
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_42:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_42]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY42]], killed [[S_ADD_I32_42]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 356, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_43:%[0-9]+]]:sreg_32 = S_MOV_B32 352
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_43:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_43]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY43]], killed [[S_ADD_I32_43]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 352, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_44:%[0-9]+]]:sreg_32 = S_MOV_B32 348
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_44:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_44]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY44]], killed [[S_ADD_I32_44]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 348, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_45:%[0-9]+]]:sreg_32 = S_MOV_B32 344
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_45:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_45]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY45]], killed [[S_ADD_I32_45]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 344, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_46:%[0-9]+]]:sreg_32 = S_MOV_B32 340
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_46:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_46]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY46]], killed [[S_ADD_I32_46]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 340, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_47:%[0-9]+]]:sreg_32 = S_MOV_B32 336
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_47:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_47]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY47]], killed [[S_ADD_I32_47]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 336, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_48:%[0-9]+]]:sreg_32 = S_MOV_B32 332
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_48:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_48]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY48]], killed [[S_ADD_I32_48]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 332, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_49:%[0-9]+]]:sreg_32 = S_MOV_B32 328
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_49:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_49]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY49]], killed [[S_ADD_I32_49]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 328, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_50:%[0-9]+]]:sreg_32 = S_MOV_B32 324
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_50:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_50]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY50]], killed [[S_ADD_I32_50]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 324, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_51:%[0-9]+]]:sreg_32 = S_MOV_B32 320
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_51:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_51]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY51]], killed [[S_ADD_I32_51]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 320, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_52:%[0-9]+]]:sreg_32 = S_MOV_B32 316
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_52:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_52]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY52]], killed [[S_ADD_I32_52]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 316, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_53:%[0-9]+]]:sreg_32 = S_MOV_B32 312
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_53:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_53]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY53]], killed [[S_ADD_I32_53]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 312, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_54:%[0-9]+]]:sreg_32 = S_MOV_B32 308
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_54:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_54]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY54]], killed [[S_ADD_I32_54]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 308, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_55:%[0-9]+]]:sreg_32 = S_MOV_B32 304
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_55:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_55]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY55]], killed [[S_ADD_I32_55]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 304, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_56:%[0-9]+]]:sreg_32 = S_MOV_B32 300
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_56:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_56]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY56]], killed [[S_ADD_I32_56]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 300, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_57:%[0-9]+]]:sreg_32 = S_MOV_B32 296
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_57:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_57]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY57]], killed [[S_ADD_I32_57]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 296, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_58:%[0-9]+]]:sreg_32 = S_MOV_B32 292
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_58:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_58]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY58]], killed [[S_ADD_I32_58]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 292, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_59:%[0-9]+]]:sreg_32 = S_MOV_B32 288
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_59:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_59]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY59]], killed [[S_ADD_I32_59]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 288, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_60:%[0-9]+]]:sreg_32 = S_MOV_B32 284
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_60:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_60]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY60]], killed [[S_ADD_I32_60]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 284, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_61:%[0-9]+]]:sreg_32 = S_MOV_B32 280
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_61:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_61]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY61]], killed [[S_ADD_I32_61]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 280, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_62:%[0-9]+]]:sreg_32 = S_MOV_B32 276
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_62:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_62]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY62]], killed [[S_ADD_I32_62]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 276, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_63:%[0-9]+]]:sreg_32 = S_MOV_B32 272
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_63:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_63]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY63]], killed [[S_ADD_I32_63]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 272, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_64:%[0-9]+]]:sreg_32 = S_MOV_B32 268
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_64:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_64]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY64]], killed [[S_ADD_I32_64]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 268, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_65:%[0-9]+]]:sreg_32 = S_MOV_B32 264
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_65:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_65]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY65]], killed [[S_ADD_I32_65]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 264, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_66:%[0-9]+]]:sreg_32 = S_MOV_B32 260
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_66:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_66]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY66]], killed [[S_ADD_I32_66]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 260, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_67:%[0-9]+]]:sreg_32 = S_MOV_B32 256
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_67:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_67]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY67]], killed [[S_ADD_I32_67]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 256, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_68:%[0-9]+]]:sreg_32 = S_MOV_B32 252
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_68:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_68]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY68]], killed [[S_ADD_I32_68]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 252, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_69:%[0-9]+]]:sreg_32 = S_MOV_B32 248
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_69:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_69]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY69]], killed [[S_ADD_I32_69]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 248, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_70:%[0-9]+]]:sreg_32 = S_MOV_B32 244
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_70:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_70]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY70]], killed [[S_ADD_I32_70]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 244, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_71:%[0-9]+]]:sreg_32 = S_MOV_B32 240
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_71:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_71]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY71]], killed [[S_ADD_I32_71]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 240, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_72:%[0-9]+]]:sreg_32 = S_MOV_B32 236
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_72:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_72]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY72]], killed [[S_ADD_I32_72]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 236, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_73:%[0-9]+]]:sreg_32 = S_MOV_B32 232
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_73:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_73]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY73]], killed [[S_ADD_I32_73]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 232, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_74:%[0-9]+]]:sreg_32 = S_MOV_B32 228
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_74:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_74]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY74]], killed [[S_ADD_I32_74]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 228, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_75:%[0-9]+]]:sreg_32 = S_MOV_B32 224
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_75:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_75]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY75]], killed [[S_ADD_I32_75]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 224, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_76:%[0-9]+]]:sreg_32 = S_MOV_B32 220
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_76:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_76]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY76]], killed [[S_ADD_I32_76]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 220, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_77:%[0-9]+]]:sreg_32 = S_MOV_B32 216
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_77:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_77]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY77]], killed [[S_ADD_I32_77]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 216, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_78:%[0-9]+]]:sreg_32 = S_MOV_B32 212
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_78:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_78]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY78]], killed [[S_ADD_I32_78]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 212, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_79:%[0-9]+]]:sreg_32 = S_MOV_B32 208
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_79:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_79]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY79]], killed [[S_ADD_I32_79]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 208, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_80:%[0-9]+]]:sreg_32 = S_MOV_B32 204
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_80:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_80]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY80]], killed [[S_ADD_I32_80]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 204, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_81:%[0-9]+]]:sreg_32 = S_MOV_B32 200
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_81:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_81]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY81]], killed [[S_ADD_I32_81]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 200, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_82:%[0-9]+]]:sreg_32 = S_MOV_B32 196
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_82:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_82]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY82]], killed [[S_ADD_I32_82]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 196, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_83:%[0-9]+]]:sreg_32 = S_MOV_B32 192
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_83:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_83]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY83]], killed [[S_ADD_I32_83]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 192, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_84:%[0-9]+]]:sreg_32 = S_MOV_B32 188
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_84:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_84]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY84]], killed [[S_ADD_I32_84]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 188, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_85:%[0-9]+]]:sreg_32 = S_MOV_B32 184
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_85:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_85]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY85]], killed [[S_ADD_I32_85]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 184, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_86:%[0-9]+]]:sreg_32 = S_MOV_B32 180
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_86:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_86]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY86]], killed [[S_ADD_I32_86]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 180, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_87:%[0-9]+]]:sreg_32 = S_MOV_B32 176
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_87:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_87]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY87]], killed [[S_ADD_I32_87]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 176, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_88:%[0-9]+]]:sreg_32 = S_MOV_B32 172
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_88:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_88]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY88]], killed [[S_ADD_I32_88]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 172, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_89:%[0-9]+]]:sreg_32 = S_MOV_B32 168
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_89:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_89]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY89]], killed [[S_ADD_I32_89]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 168, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_90:%[0-9]+]]:sreg_32 = S_MOV_B32 164
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_90:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_90]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY90]], killed [[S_ADD_I32_90]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 164, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_91:%[0-9]+]]:sreg_32 = S_MOV_B32 160
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_91:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_91]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY91]], killed [[S_ADD_I32_91]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 160, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_92:%[0-9]+]]:sreg_32 = S_MOV_B32 156
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_92:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_92]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY92]], killed [[S_ADD_I32_92]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 156, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_93:%[0-9]+]]:sreg_32 = S_MOV_B32 152
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_93:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_93]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY93]], killed [[S_ADD_I32_93]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 152, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_94:%[0-9]+]]:sreg_32 = S_MOV_B32 148
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_94:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_94]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY94]], killed [[S_ADD_I32_94]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 148, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_95:%[0-9]+]]:sreg_32 = S_MOV_B32 144
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_95:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_95]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY95]], killed [[S_ADD_I32_95]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 144, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_96:%[0-9]+]]:sreg_32 = S_MOV_B32 140
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_96:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_96]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY96]], killed [[S_ADD_I32_96]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 140, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_97:%[0-9]+]]:sreg_32 = S_MOV_B32 136
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_97:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_97]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY97]], killed [[S_ADD_I32_97]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 136, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_98:%[0-9]+]]:sreg_32 = S_MOV_B32 132
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_98:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_98]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY98]], killed [[S_ADD_I32_98]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 132, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_99:%[0-9]+]]:sreg_32 = S_MOV_B32 128
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_99:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_99]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY99]], killed [[S_ADD_I32_99]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 128, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_100:%[0-9]+]]:sreg_32 = S_MOV_B32 124
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_100:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_100]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY100]], killed [[S_ADD_I32_100]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 124, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_101:%[0-9]+]]:sreg_32 = S_MOV_B32 120
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_101:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_101]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY101]], killed [[S_ADD_I32_101]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 120, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_102:%[0-9]+]]:sreg_32 = S_MOV_B32 116
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_102:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_102]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY102]], killed [[S_ADD_I32_102]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 116, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_103:%[0-9]+]]:sreg_32 = S_MOV_B32 112
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_103:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_103]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY103]], killed [[S_ADD_I32_103]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 112, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_104:%[0-9]+]]:sreg_32 = S_MOV_B32 108
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_104:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_104]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY104]], killed [[S_ADD_I32_104]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 108, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_105:%[0-9]+]]:sreg_32 = S_MOV_B32 104
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_105:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_105]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY105]], killed [[S_ADD_I32_105]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 104, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_106:%[0-9]+]]:sreg_32 = S_MOV_B32 100
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_106:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_106]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY106]], killed [[S_ADD_I32_106]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 100, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_107:%[0-9]+]]:sreg_32 = S_MOV_B32 96
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_107:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_107]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY107]], killed [[S_ADD_I32_107]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 96, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_108:%[0-9]+]]:sreg_32 = S_MOV_B32 92
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_108:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_108]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY108]], killed [[S_ADD_I32_108]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 92, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_109:%[0-9]+]]:sreg_32 = S_MOV_B32 88
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_109:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_109]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY109]], killed [[S_ADD_I32_109]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 88, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_110:%[0-9]+]]:sreg_32 = S_MOV_B32 84
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_110:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_110]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY110]], killed [[S_ADD_I32_110]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 84, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_111:%[0-9]+]]:sreg_32 = S_MOV_B32 80
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_111:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_111]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY111]], killed [[S_ADD_I32_111]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 80, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_112:%[0-9]+]]:sreg_32 = S_MOV_B32 76
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_112:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_112]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY112]], killed [[S_ADD_I32_112]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 76, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_113:%[0-9]+]]:sreg_32 = S_MOV_B32 72
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_113:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_113]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY113]], killed [[S_ADD_I32_113]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 72, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_114:%[0-9]+]]:sreg_32 = S_MOV_B32 68
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_114:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_114]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY114]], killed [[S_ADD_I32_114]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 68, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_115:%[0-9]+]]:sreg_32 = S_MOV_B32 64
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_115:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_115]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY115]], killed [[S_ADD_I32_115]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 64, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_116:%[0-9]+]]:sreg_32 = S_MOV_B32 60
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_116:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_116]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY116]], killed [[S_ADD_I32_116]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 60, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_117:%[0-9]+]]:sreg_32 = S_MOV_B32 56
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_117:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_117]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY117]], killed [[S_ADD_I32_117]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 56, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_118:%[0-9]+]]:sreg_32 = S_MOV_B32 52
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_118:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_118]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY118]], killed [[S_ADD_I32_118]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 52, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_119:%[0-9]+]]:sreg_32 = S_MOV_B32 48
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_119:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_119]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY119]], killed [[S_ADD_I32_119]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 48, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_120:%[0-9]+]]:sreg_32 = S_MOV_B32 44
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_120:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_120]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY120]], killed [[S_ADD_I32_120]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 44, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_121:%[0-9]+]]:sreg_32 = S_MOV_B32 40
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_121:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_121]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY121]], killed [[S_ADD_I32_121]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 40, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_122:%[0-9]+]]:sreg_32 = S_MOV_B32 36
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_122:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_122]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY122]], killed [[S_ADD_I32_122]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 36, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_123:%[0-9]+]]:sreg_32 = S_MOV_B32 32
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_123:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_123]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY123]], killed [[S_ADD_I32_123]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 32, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_124:%[0-9]+]]:sreg_32 = S_MOV_B32 28
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_124:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_124]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY124]], killed [[S_ADD_I32_124]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 28, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_125:%[0-9]+]]:sreg_32 = S_MOV_B32 24
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_125:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_125]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY125]], killed [[S_ADD_I32_125]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 24, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_126:%[0-9]+]]:sreg_32 = S_MOV_B32 20
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_126:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_126]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY126]], killed [[S_ADD_I32_126]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 20, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_127:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_127:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_127]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY127]], killed [[S_ADD_I32_127]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 16, align 16, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_128:%[0-9]+]]:sreg_32 = S_MOV_B32 12
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_128:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_128]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[COPY166:%[0-9]+]]:vgpr_32 = COPY [[COPY128]]
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY166]], killed [[S_ADD_I32_128]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 12, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_129:%[0-9]+]]:sreg_32 = S_MOV_B32 8
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_129:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_129]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[COPY167:%[0-9]+]]:vgpr_32 = COPY [[COPY129]]
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY167]], killed [[S_ADD_I32_129]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 8, align 8, addrspace 5)
-  ; DAGISEL-GFX11-NEXT:   [[S_MOV_B32_130:%[0-9]+]]:sreg_32 = S_MOV_B32 4
-  ; DAGISEL-GFX11-NEXT:   [[S_ADD_I32_130:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 [[COPY164]], killed [[S_MOV_B32_130]], implicit-def dead $scc
-  ; DAGISEL-GFX11-NEXT:   [[COPY168:%[0-9]+]]:vgpr_32 = COPY [[COPY130]]
-  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY168]], killed [[S_ADD_I32_130]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 4, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY]], [[COPY164]], 524, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 524, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY1]], [[COPY164]], 520, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 520, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY2]], [[COPY164]], 516, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 516, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY3]], [[COPY164]], 512, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 512, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY4]], [[COPY164]], 508, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 508, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY5]], [[COPY164]], 504, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 504, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY6]], [[COPY164]], 500, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 500, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY7]], [[COPY164]], 496, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 496, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY8]], [[COPY164]], 492, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 492, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY9]], [[COPY164]], 488, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 488, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY10]], [[COPY164]], 484, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 484, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY11]], [[COPY164]], 480, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 480, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY12]], [[COPY164]], 476, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 476, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY13]], [[COPY164]], 472, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 472, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY14]], [[COPY164]], 468, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 468, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY15]], [[COPY164]], 464, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 464, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY16]], [[COPY164]], 460, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 460, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY17]], [[COPY164]], 456, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 456, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY18]], [[COPY164]], 452, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 452, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY19]], [[COPY164]], 448, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 448, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY20]], [[COPY164]], 444, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 444, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY21]], [[COPY164]], 440, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 440, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY22]], [[COPY164]], 436, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 436, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY23]], [[COPY164]], 432, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 432, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY24]], [[COPY164]], 428, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 428, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY25]], [[COPY164]], 424, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 424, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY26]], [[COPY164]], 420, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 420, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY27]], [[COPY164]], 416, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 416, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY28]], [[COPY164]], 412, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 412, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY29]], [[COPY164]], 408, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 408, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY30]], [[COPY164]], 404, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 404, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY31]], [[COPY164]], 400, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 400, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY32]], [[COPY164]], 396, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 396, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY33]], [[COPY164]], 392, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 392, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY34]], [[COPY164]], 388, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 388, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY35]], [[COPY164]], 384, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 384, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY36]], [[COPY164]], 380, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 380, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY37]], [[COPY164]], 376, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 376, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY38]], [[COPY164]], 372, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 372, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY39]], [[COPY164]], 368, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 368, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY40]], [[COPY164]], 364, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 364, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY41]], [[COPY164]], 360, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 360, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY42]], [[COPY164]], 356, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 356, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY43]], [[COPY164]], 352, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 352, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY44]], [[COPY164]], 348, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 348, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY45]], [[COPY164]], 344, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 344, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY46]], [[COPY164]], 340, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 340, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY47]], [[COPY164]], 336, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 336, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY48]], [[COPY164]], 332, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 332, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY49]], [[COPY164]], 328, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 328, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY50]], [[COPY164]], 324, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 324, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY51]], [[COPY164]], 320, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 320, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY52]], [[COPY164]], 316, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 316, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY53]], [[COPY164]], 312, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 312, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY54]], [[COPY164]], 308, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 308, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY55]], [[COPY164]], 304, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 304, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY56]], [[COPY164]], 300, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 300, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY57]], [[COPY164]], 296, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 296, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY58]], [[COPY164]], 292, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 292, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY59]], [[COPY164]], 288, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 288, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY60]], [[COPY164]], 284, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 284, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY61]], [[COPY164]], 280, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 280, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY62]], [[COPY164]], 276, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 276, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY63]], [[COPY164]], 272, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 272, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY64]], [[COPY164]], 268, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 268, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY65]], [[COPY164]], 264, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 264, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY66]], [[COPY164]], 260, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 260, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY67]], [[COPY164]], 256, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 256, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY68]], [[COPY164]], 252, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 252, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY69]], [[COPY164]], 248, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 248, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY70]], [[COPY164]], 244, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 244, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY71]], [[COPY164]], 240, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 240, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY72]], [[COPY164]], 236, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 236, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY73]], [[COPY164]], 232, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 232, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY74]], [[COPY164]], 228, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 228, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY75]], [[COPY164]], 224, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 224, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY76]], [[COPY164]], 220, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 220, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY77]], [[COPY164]], 216, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 216, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY78]], [[COPY164]], 212, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 212, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY79]], [[COPY164]], 208, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 208, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY80]], [[COPY164]], 204, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 204, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY81]], [[COPY164]], 200, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 200, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY82]], [[COPY164]], 196, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 196, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY83]], [[COPY164]], 192, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 192, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY84]], [[COPY164]], 188, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 188, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY85]], [[COPY164]], 184, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 184, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY86]], [[COPY164]], 180, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 180, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY87]], [[COPY164]], 176, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 176, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY88]], [[COPY164]], 172, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 172, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY89]], [[COPY164]], 168, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 168, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY90]], [[COPY164]], 164, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 164, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY91]], [[COPY164]], 160, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 160, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY92]], [[COPY164]], 156, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 156, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY93]], [[COPY164]], 152, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 152, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY94]], [[COPY164]], 148, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 148, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY95]], [[COPY164]], 144, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 144, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY96]], [[COPY164]], 140, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 140, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY97]], [[COPY164]], 136, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 136, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY98]], [[COPY164]], 132, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 132, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY99]], [[COPY164]], 128, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 128, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY100]], [[COPY164]], 124, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 124, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY101]], [[COPY164]], 120, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 120, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY102]], [[COPY164]], 116, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 116, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY103]], [[COPY164]], 112, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 112, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY104]], [[COPY164]], 108, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 108, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY105]], [[COPY164]], 104, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 104, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY106]], [[COPY164]], 100, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 100, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY107]], [[COPY164]], 96, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 96, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY108]], [[COPY164]], 92, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 92, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY109]], [[COPY164]], 88, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 88, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY110]], [[COPY164]], 84, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 84, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY111]], [[COPY164]], 80, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 80, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY112]], [[COPY164]], 76, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 76, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY113]], [[COPY164]], 72, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 72, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY114]], [[COPY164]], 68, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 68, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY115]], [[COPY164]], 64, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 64, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY116]], [[COPY164]], 60, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 60, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY117]], [[COPY164]], 56, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 56, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY118]], [[COPY164]], 52, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 52, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY119]], [[COPY164]], 48, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 48, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY120]], [[COPY164]], 44, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 44, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY121]], [[COPY164]], 40, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 40, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY122]], [[COPY164]], 36, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 36, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY123]], [[COPY164]], 32, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 32, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY124]], [[COPY164]], 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 28, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY125]], [[COPY164]], 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 24, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY126]], [[COPY164]], 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 20, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY127]], [[COPY164]], 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 16, align 16, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   [[COPY165:%[0-9]+]]:vgpr_32 = COPY [[COPY128]]
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY165]], [[COPY164]], 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 12, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   [[COPY166:%[0-9]+]]:vgpr_32 = COPY [[COPY129]]
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY166]], [[COPY164]], 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 8, align 8, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   [[COPY167:%[0-9]+]]:vgpr_32 = COPY [[COPY130]]
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY167]], [[COPY164]], 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack + 4, addrspace 5)
+  ; DAGISEL-GFX11-NEXT:   [[COPY168:%[0-9]+]]:vgpr_32 = COPY [[COPY131]]
+  ; DAGISEL-GFX11-NEXT:   SCRATCH_STORE_DWORD_SADDR [[COPY168]], [[COPY164]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into stack, align 16, addrspace 5)
   ; DAGISEL-GFX11-NEXT:   [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use + 4, target-flags(amdgpu-gotprel32-hi) @use + 12, implicit-def dead $scc
   ; DAGISEL-GFX11-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
   ; DAGISEL-GFX11-NEXT:   $vgpr0 = COPY [[COPY163]]
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
index 29c82db6f8204ed..65bdba274ee2bfa 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
@@ -5,10 +5,9 @@ define amdgpu_gfx void @example(<4 x i32> inreg %rsrc, ptr addrspace(5) %src, i3
 ; CHECK-LABEL: example:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_add_nc_u32_e32 v3, 4, v0
 ; CHECK-NEXT:    s_clause 0x1
 ; CHECK-NEXT:    scratch_load_b32 v2, v0, off
-; CHECK-NEXT:    scratch_load_b32 v3, v3, off
+; CHECK-NEXT:    scratch_load_b32 v3, v0, off offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    buffer_store_b64 v[2:3], v1, s[4:7], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]



More information about the llvm-commits mailing list