[llvm] [AMDGPU] Folding imm offset in more cases for scratch access (PR #70634)

via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 7 19:51:57 PST 2023


https://github.com/ruiling updated https://github.com/llvm/llvm-project/pull/70634

>From d8c8b022c5f134cd21a81f1953114a9291ea37fd Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Mon, 30 Oct 2023 15:54:13 +0800
Subject: [PATCH 1/7] [AMDGPU] Folding imm offset in more cases for scratch
 access

For scratch load/store, our hardware only accept non-negative value
in SGPR/VGPR. Besides the case that we can prove from known bits,
we can also prove that the value in `base` will be non-negative:
1.) When the ADD for the address calculatioon has NonUnsignedWrap flag.
2.) When the immediate offset is already negative.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  24 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |   3 +-
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |  18 +-
 .../CodeGen/AMDGPU/flat-scratch-i8-i16.ll     |  92 +--
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      | 418 ++++++-------
 llvm/test/CodeGen/AMDGPU/function-returns.ll  |  51 +-
 .../AMDGPU/gfx-callable-return-types.ll       | 592 +++++++++---------
 llvm/test/CodeGen/AMDGPU/memory_clause.ll     |  18 +-
 8 files changed, 565 insertions(+), 651 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b5ceaaa14b4fd5e..691d644badd24b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1146,10 +1146,23 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
   return CurDAG->SignBitIsZero(Base);
 }
 
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr, SDValue Base,
                                                 uint64_t FlatVariant) const {
   if (FlatVariant != SIInstrFlags::FlatScratch)
     return true;
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    // For `nuw` addition, we should not have negative base address.
+    if (Addr->getFlags().hasNoUnsignedWrap())
+      return true;
+
+    auto *RHS = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    // If the immediate offset is negative, we should not have the base being
+    // negative as well.
+    if (RHS && RHS->getSExtValue() < 0)
+      return true;
+  }
+
   // When value in 32-bit Base can be negative calculate scratch offset using
   // 32-bit add instruction, otherwise use Base(unsigned) + offset.
   return CurDAG->SignBitIsZero(Base);
@@ -1549,7 +1562,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
     SDValue N0, N1;
     if (isBaseWithConstantOffset64(Addr, N0, N1) &&
-        isFlatScratchBaseLegal(N0, FlatVariant)) {
+        isFlatScratchBaseLegal(Addr, N0, FlatVariant)) {
       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
       const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1782,7 +1795,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
   int64_t COffsetVal = 0;
 
   if (CurDAG->isBaseWithConstantOffset(Addr) &&
-      isFlatScratchBaseLegal(Addr.getOperand(0))) {
+      isFlatScratchBaseLegal(Addr, Addr.getOperand(0))) {
     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
     SAddr = Addr.getOperand(0);
   } else {
@@ -1860,7 +1873,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
         VAddr = SDValue(VMov, 0);
         SAddr = LHS;
-        if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+        if (!isFlatScratchBaseLegal(Addr, SAddr))
           return false;
         if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
           return false;
@@ -1886,7 +1899,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
     return false;
   }
 
-  if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+  if (!isFlatScratchBaseLegal(Addr, SAddr) ||
+      !isFlatScratchBaseLegal(Addr, VAddr))
     return false;
 
   if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a8a606f60a3faee..8a47757f70bbfbc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -155,7 +155,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
                         unsigned Size) const;
   bool isFlatScratchBaseLegal(
-      SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+      SDValue Addr, SDValue Base,
+      uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
 
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index af023835c529776..329f0a2068cb072 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -704,11 +704,11 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR:       ; %bb.0: ; %bb
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
-; FLATSCR-NEXT:    v_add_u32_e32 v2, 2, v0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; FLATSCR-NEXT:    scratch_load_short_d16 v0, v2, off
+; FLATSCR-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; FLATSCR-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
@@ -726,22 +726,22 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR_GFX10:       ; %bb.0: ; %bb
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
-; FLATSCR_GFX10-NEXT:    v_add_nc_u32_e32 v2, 2, v0
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v0, v2, off
+; FLATSCR_GFX10-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
-; GFX11-NEXT:    v_add_nc_u32_e32 v2, 2, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; GFX11-NEXT:    scratch_load_d16_b16 v0, v2, off
+; GFX11-NEXT:    v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; GFX11-NEXT:    scratch_load_d16_b16 v1, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
index c849cf08094e718..ad4d4a4a30fc6d0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -11,16 +11,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %ou
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -38,16 +36,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %ou
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
-; GFX10-NEXT:    scratch_load_sbyte v0, v0, off
+; GFX10-NEXT:    scratch_load_sbyte v0, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
-; GFX11-NEXT:    scratch_load_i8 v0, v0, off
+; GFX11-NEXT:    scratch_load_i8 v0, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -65,16 +61,14 @@ define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %o
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
-; GFX10-NEXT:    scratch_load_ushort v0, v0, off
+; GFX10-NEXT:    scratch_load_ushort v0, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_zext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 2, v0
-; GFX11-NEXT:    scratch_load_u16 v0, v0, off
+; GFX11-NEXT:    scratch_load_u16 v0, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -92,16 +86,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %o
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 2, v0
-; GFX10-NEXT:    scratch_load_sshort v0, v0, off
+; GFX10-NEXT:    scratch_load_sshort v0, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_sext_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, 2, v0
-; GFX11-NEXT:    scratch_load_i16 v0, v0, off
+; GFX11-NEXT:    scratch_load_i16 v0, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -359,16 +351,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, p
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    s_add_i32 s2, s2, 1
-; GFX10-NEXT:    scratch_load_ubyte v2, off, s2
+; GFX10-NEXT:    scratch_load_ubyte v2, off, s2 offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_add_i32 s0, s0, 1
-; GFX11-NEXT:    scratch_load_u8 v2, off, s0
+; GFX11-NEXT:    scratch_load_u8 v2, off, s0 offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -386,16 +376,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, p
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    s_add_i32 s2, s2, 1
-; GFX10-NEXT:    scratch_load_sbyte v2, off, s2
+; GFX10-NEXT:    scratch_load_sbyte v2, off, s2 offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_add_i32 s0, s0, 1
-; GFX11-NEXT:    scratch_load_i8 v2, off, s0
+; GFX11-NEXT:    scratch_load_i8 v2, off, s0 offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -413,16 +401,14 @@ define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in,
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    s_add_i32 s2, s2, 2
-; GFX10-NEXT:    scratch_load_ushort v2, off, s2
+; GFX10-NEXT:    scratch_load_ushort v2, off, s2 offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_zext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_add_i32 s0, s0, 2
-; GFX11-NEXT:    scratch_load_u16 v2, off, s0
+; GFX11-NEXT:    scratch_load_u16 v2, off, s0 offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -440,16 +426,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in,
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    s_add_i32 s2, s2, 2
-; GFX10-NEXT:    scratch_load_sshort v2, off, s2
+; GFX10-NEXT:    scratch_load_sshort v2, off, s2 offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[0:1], v2
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_sext_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_add_i32 s0, s0, 2
-; GFX11-NEXT:    scratch_load_i16 v2, off, s0
+; GFX11-NEXT:    scratch_load_i16 v2, off, s0 offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX11-NEXT:    s_endpgm
@@ -713,19 +697,16 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in,
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_zext_svs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -745,19 +726,16 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in,
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT:    scratch_load_sbyte v0, v0, off
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    scratch_load_sbyte v0, v0, off offset:1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i8_sext_svs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT:    scratch_load_i8 v0, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    scratch_load_i8 v0, v0, off offset:1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -777,19 +755,16 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
-; GFX10-NEXT:    scratch_load_ushort v0, v0, off
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    scratch_load_ushort v0, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_zext_svs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 2
-; GFX11-NEXT:    scratch_load_u16 v0, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    scratch_load_u16 v0, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
@@ -809,19 +784,16 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 2
-; GFX10-NEXT:    scratch_load_sshort v0, v0, off
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT:    scratch_load_sshort v0, v0, off offset:2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: test_scratch_load_i16_sext_svs:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 2
-; GFX11-NEXT:    scratch_load_i16 v0, v0, off
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT:    scratch_load_i16 v0, v0, off offset:2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 07b3df2a8520aae..fe984ffa653a5ef 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -576,11 +576,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
-; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -592,24 +591,22 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-NEXT:    scratch_store_dword v0, v2, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
+; GFX10-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
-; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:4 dlc
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -628,8 +625,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -640,8 +636,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
-; GFX940-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -658,24 +653,22 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
+; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
+; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
+; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_kernel:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v1, 4, v0
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off offset:4 dlc
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 4, v0
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -798,58 +791,53 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: private_ptr_foo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_ptr_foo:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_ptr_foo:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
-; GFX11-NEXT:    scratch_store_b32 v0, v1, off
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: private_ptr_foo:
 ; GFX9-PAL:       ; %bb.0:
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: private_ptr_foo:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: private_ptr_foo:
 ; GFX10-PAL:       ; %bb.0:
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: private_ptr_foo:
 ; GFX11-PAL:       ; %bb.0:
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
   store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -1554,11 +1542,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1572,26 +1559,23 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-NEXT:    scratch_store_dword v0, v2, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX10-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:260 dlc
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -1610,11 +1594,10 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s0 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x104, v0
-; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
 ; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -1627,8 +1610,7 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:260 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0x104, v0
-; GFX940-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -1648,12 +1630,11 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, s0 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX1010-PAL-NEXT:    scratch_store_dword v0, v2, off
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
+; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
@@ -1672,26 +1653,23 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX1030-PAL-NEXT:    scratch_store_dword v0, v2, off
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x104, v0
+; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x104, v0
+; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1030-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x104, v0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off offset:260 dlc
+; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -2595,11 +2573,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 15
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
 ; GFX9-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2613,27 +2590,24 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-NEXT:    scratch_store_dword v0, v2, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
+; GFX10-NEXT:    scratch_store_dword v1, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_movk_i32 s0, 0x4004
 ; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-NEXT:    scratch_store_b32 v0, v2, s0 dlc
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -2652,11 +2626,10 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s0 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x4004, v0
-; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
 ; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -2670,8 +2643,7 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX940-NEXT:    scratch_store_dword v0, v1, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
-; GFX940-NEXT:    v_add_u32_e32 v0, 0x7c, v0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -2691,12 +2663,11 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, s0 offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
-; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX1010-PAL-NEXT:    scratch_store_dword v0, v2, off
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
+; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
+; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1010-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
@@ -2715,27 +2686,24 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
-; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX1030-PAL-NEXT:    scratch_store_dword v0, v2, off
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
+; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
+; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX1030-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x4004
 ; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v1, 0x4004, v0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s0 dlc
+; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, s0 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -2922,15 +2890,13 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ;
 ; GFX11-LABEL: store_load_large_imm_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT:    s_movk_i32 s0, 0x3000
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s0, s0, 4
+; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
+; GFX11-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX11-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 off, v1, s0 offset:3712 dlc
+; GFX11-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, off, s0 offset:3712 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -2959,14 +2925,13 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX940-LABEL: store_load_large_imm_offset_kernel:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 13
-; GFX940-NEXT:    s_movk_i32 s0, 0x3000
 ; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    s_add_i32 s0, s0, 4
-; GFX940-NEXT:    v_mov_b32_e32 v0, 15
-; GFX940-NEXT:    scratch_store_dword off, v0, s0 offset:3712 sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:3716 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, off, s0 offset:3712 sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:3716 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -3019,15 +2984,13 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ;
 ; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel:
 ; GFX11-PAL:       ; %bb.0: ; %bb
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x3000
-; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 4
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_store_b32 off, v1, s0 offset:3712 dlc
+; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s0 offset:3712 glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -3077,16 +3040,13 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-LABEL: store_load_large_imm_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-NEXT:    s_movk_i32 s0, 0x3000
-; GFX11-NEXT:    s_add_i32 s1, s32, 4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
+; GFX11-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 off, v1, s0 offset:3712 dlc
+; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, off, s0 offset:3712 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3110,15 +3070,13 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 13
-; GFX940-NEXT:    s_movk_i32 s0, 0x3000
-; GFX940-NEXT:    s_add_i32 s1, s32, 4
 ; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    s_add_i32 s0, s0, s1
-; GFX940-NEXT:    v_mov_b32_e32 v0, 15
-; GFX940-NEXT:    scratch_store_dword off, v0, s0 offset:3712 sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
+; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, off, s0 offset:3712 sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3141,16 +3099,13 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-PAL-LABEL: store_load_large_imm_offset_foo:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
-; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x3000
-; GFX11-PAL-NEXT:    s_add_i32 s1, s32, 4
-; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-PAL-NEXT:    s_add_i32 s0, s0, s1
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_store_b32 off, v1, s0 offset:3712 dlc
+; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s0 offset:3712 glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -3168,17 +3123,16 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-LABEL: store_load_vidx_sidx_offset:
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-NEXT:    v_add_u32_e32 v0, 0x400, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 15
-; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -3189,29 +3143,26 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v1, 4
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GFX10-NEXT:    v_add3_u32 v0, v1, v0, 0x400
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v0, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vidx_sidx_offset:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
+; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, v1, v0, 0x400
-; GFX11-NEXT:    v_mov_b32_e32 v1, 15
-; GFX11-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:1024 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1024 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -3226,13 +3177,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
-; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
-; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0x400, v0
+; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
-; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
@@ -3243,11 +3193,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX940-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX940-NEXT:    v_add_u32_e32 v0, 0x400, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 15
-; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1024 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1024 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
@@ -3263,29 +3212,26 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
 ; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
 ; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 4
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GFX10-PAL-NEXT:    v_add3_u32 v0, v1, v0, 0x400
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
-; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
+; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off glc dlc
+; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
 ; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-PAL-NEXT:    v_add3_u32 v0, v1, v0, 0x400
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1024 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1024 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
@@ -3657,21 +3603,20 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX10-LABEL: store_load_i32_negative_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, -1, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-NEXT:    scratch_store_byte v0, v1, off
+; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i32_negative_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
-; GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
+; GFX11-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3697,24 +3642,34 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-PAL-LABEL: store_load_i32_negative_unaligned:
-; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-PAL-NEXT:    scratch_store_byte v0, v1, off
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_i32_negative_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
-; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off dlc
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -3739,21 +3694,21 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX10-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX10:       ; %bb.0: ; %bb
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xffffef7f, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-NEXT:    scratch_store_byte v0, v1, off
+; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0
-; GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
+; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
+; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3771,32 +3726,43 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX940-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
+; GFX940-NEXT:    s_movk_i32 s0, 0xef7f
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 1
-; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
+; GFX940-NEXT:    scratch_store_byte v0, v1, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_ubyte v0, v0, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-PAL-LABEL: store_load_i32_large_negative_unaligned:
-; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffef7f, v0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX10-PAL-NEXT:    scratch_store_byte v0, v1, off
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0
-; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off dlc
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
+; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off glc dlc
+; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 9df4b903de4bda0..e7d86c0c178e970 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -1561,20 +1561,16 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
 ; GFX11-NEXT:    buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
 ; GFX11-NEXT:    buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
-; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
-; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
 ; GFX11-NEXT:    buffer_load_b32 v33, off, s[0:3], 0 offset:128
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
 ; GFX11-NEXT:    s_add_i32 s3, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s4, s0, 64
-; GFX11-NEXT:    s_add_i32 s5, s0, 48
-; GFX11-NEXT:    s_add_i32 s6, s0, 32
-; GFX11-NEXT:    s_add_i32 s7, s0, 16
-; GFX11-NEXT:    s_add_i32 s8, s0, 0x80
+; GFX11-NEXT:    s_add_i32 s4, s0, 48
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
@@ -1582,17 +1578,17 @@ define <33 x i32> @v33i32_func_void() #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s0 offset:64
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s4
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s0 offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0 offset:32
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 off, v33, s8
+; GFX11-NEXT:    scratch_store_b32 off, v33, s0 offset:128
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <33 x i32>, ptr addrspace(1) %ptr
@@ -1854,20 +1850,16 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
 ; GFX11-NEXT:    buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
 ; GFX11-NEXT:    buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
-; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
-; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
-; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0 offset:32
 ; GFX11-NEXT:    buffer_load_b32 v33, off, s[0:3], 0 offset:128
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
 ; GFX11-NEXT:    s_add_i32 s3, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s4, s0, 64
-; GFX11-NEXT:    s_add_i32 s5, s0, 48
-; GFX11-NEXT:    s_add_i32 s6, s0, 32
-; GFX11-NEXT:    s_add_i32 s7, s0, 16
-; GFX11-NEXT:    s_add_i32 s8, s0, 0x80
+; GFX11-NEXT:    s_add_i32 s4, s0, 48
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
@@ -1875,17 +1867,17 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(6)
 ; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s3
 ; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s0 offset:64
 ; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s4
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s0 offset:16
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s0
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0 offset:32
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 off, v33, s8
+; GFX11-NEXT:    scratch_store_b32 off, v33, s0 offset:128
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
@@ -2160,7 +2152,6 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX11-NEXT:    s_add_i32 s5, s0, 0xb0
 ; GFX11-NEXT:    s_add_i32 s6, s0, 0xa0
 ; GFX11-NEXT:    s_add_i32 s7, s0, 0x90
-; GFX11-NEXT:    s_add_i32 s8, s0, 0x80
 ; GFX11-NEXT:    s_waitcnt vmcnt(8)
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
@@ -2176,7 +2167,7 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(2)
 ; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s7
 ; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s8
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0 offset:128
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    scratch_store_b32 off, v33, s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
index 0d54da3128a617a..23502d1b36d1828 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll
@@ -1990,135 +1990,140 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:1024
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:512
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:256
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:128
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:64
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:32
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0 offset:16
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x7f0
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x7e0
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT:    s_add_i32 s3, s0, 0x7d0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s3
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x7c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x7b0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x7d0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x7c0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x7a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x790
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x7b0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x7a0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x780
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x770
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x790
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x780
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x760
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x750
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x770
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x760
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x740
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x730
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x750
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x740
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x720
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x710
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x730
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x720
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x700
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x6f0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x710
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x700
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x6e0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x6d0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x6f0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x6e0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x6c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x6b0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x6d0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x6c0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x6a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x690
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x6b0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x6a0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x680
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x670
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x690
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x680
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x660
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x650
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x670
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x660
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x640
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x630
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x650
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x640
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x620
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x610
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x630
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x620
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x600
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x5f0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x610
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x600
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x5e0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x5d0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x5f0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x5e0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x5c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x5b0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x5d0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x5c0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x5a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x590
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x5b0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x5a0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x580
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x570
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x590
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x580
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x560
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x550
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x570
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x560
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x540
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x530
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x550
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x540
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x520
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x510
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x530
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x520
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x500
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x4f0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x510
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x500
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x4e0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x4d0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x4f0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x4e0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x4c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x4b0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x4d0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x4c0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x4a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x490
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x4b0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x4a0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x480
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x470
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x490
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x480
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x460
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x450
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x470
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x460
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x440
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x430
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x450
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x440
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x420
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x410
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x430
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x420
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x400
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x410
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x3f0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
@@ -2182,39 +2187,35 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x210
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x200
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x1f0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x1f0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x1e0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x1e0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x1d0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x1d0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x1c0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x1c0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x1b0
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x1b0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x1a0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x1a0
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x190
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x190
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x180
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x180
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x170
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x170
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x160
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x160
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x150
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x150
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x140
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x140
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x130
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x130
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x120
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x120
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x110
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x100
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x110
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0xf0
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
@@ -2230,20 +2231,12 @@ define amdgpu_gfx <512 x i32> @return_512xi32() #0 {
 ; GFX11-NEXT:    s_add_i32 s2, s0, 0x90
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x80
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x70
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x60
-; GFX11-NEXT:    s_add_i32 s2, s0, 0x50
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 64
-; GFX11-NEXT:    s_add_i32 s2, s0, 48
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s2
-; GFX11-NEXT:    s_add_i32 s1, s0, 32
-; GFX11-NEXT:    s_add_i32 s0, s0, 16
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x50
+; GFX11-NEXT:    s_add_i32 s0, s0, 48
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -2643,141 +2636,122 @@ define amdgpu_gfx <72 x i32> @return_72xi32(<72 x i32> %val) #1 {
 ; GFX11-LABEL: return_72xi32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x10
-; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:220
-; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:216
-; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:212
-; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:208
-; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:204
-; GFX11-NEXT:    scratch_store_b32 off, v45, s32 offset:200
-; GFX11-NEXT:    scratch_store_b32 off, v46, s32 offset:196
-; GFX11-NEXT:    scratch_store_b32 off, v47, s32 offset:192
-; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:188
-; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:184
-; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:180
-; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:176
-; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:172
-; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:168
-; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:164
-; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s32 offset:224
-; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s32 offset:240
-; GFX11-NEXT:    s_clause 0x12
-; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:160
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:156
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:152
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:148
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:144
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:140
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:136
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:132
-; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:60
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:72
-; GFX11-NEXT:    v_dual_mov_b32 v27, v24 :: v_dual_mov_b32 v26, v23
-; GFX11-NEXT:    v_dual_mov_b32 v25, v22 :: v_dual_mov_b32 v24, v21
+; GFX11-NEXT:    s_clause 0xc
+; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:212
+; GFX11-NEXT:    scratch_store_b32 off, v41, s32 offset:208
+; GFX11-NEXT:    scratch_store_b32 off, v42, s32 offset:204
+; GFX11-NEXT:    scratch_store_b32 off, v43, s32 offset:200
+; GFX11-NEXT:    scratch_store_b32 off, v44, s32 offset:196
+; GFX11-NEXT:    scratch_store_b32 off, v56, s32 offset:192
+; GFX11-NEXT:    scratch_store_b32 off, v57, s32 offset:188
+; GFX11-NEXT:    scratch_store_b32 off, v58, s32 offset:184
+; GFX11-NEXT:    scratch_store_b32 off, v59, s32 offset:180
+; GFX11-NEXT:    scratch_store_b32 off, v60, s32 offset:176
+; GFX11-NEXT:    scratch_store_b32 off, v61, s32 offset:172
+; GFX11-NEXT:    scratch_store_b32 off, v62, s32 offset:168
+; GFX11-NEXT:    scratch_store_b32 off, v63, s32 offset:164
+; GFX11-NEXT:    s_clause 0x14
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:120
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s0 offset:64
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_load_b32 v20, off, s32 offset:144
+; GFX11-NEXT:    scratch_load_b32 v19, off, s32 offset:140
+; GFX11-NEXT:    scratch_load_b32 v18, off, s32 offset:136
+; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s0 offset:32
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_load_b32 v12, off, s32 offset:160
+; GFX11-NEXT:    scratch_load_b32 v11, off, s32 offset:156
+; GFX11-NEXT:    scratch_load_b32 v10, off, s32 offset:152
+; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s0 offset:16
+; GFX11-NEXT:    s_clause 0xd
+; GFX11-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v9, off, s32 offset:148
+; GFX11-NEXT:    scratch_load_b32 v17, off, s32 offset:132
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:68
+; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32
 ; GFX11-NEXT:    s_add_i32 s1, s0, 0x110
 ; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s0
-; GFX11-NEXT:    v_dual_mov_b32 v23, v20 :: v_dual_mov_b32 v22, v19
-; GFX11-NEXT:    v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v20, v17
-; GFX11-NEXT:    v_dual_mov_b32 v19, v16 :: v_dual_mov_b32 v18, v15
-; GFX11-NEXT:    v_dual_mov_b32 v17, v14 :: v_dual_mov_b32 v16, v13
-; GFX11-NEXT:    v_dual_mov_b32 v15, v12 :: v_dual_mov_b32 v14, v11
-; GFX11-NEXT:    v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v12, v9
-; GFX11-NEXT:    v_dual_mov_b32 v11, v8 :: v_dual_mov_b32 v10, v7
-; GFX11-NEXT:    v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v8, v5
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b32 v7, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_b32 v6, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_b32 v5, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:104
-; GFX11-NEXT:    s_add_i32 s2, s0, 0xe0
-; GFX11-NEXT:    s_add_i32 s3, s0, 0xd0
-; GFX11-NEXT:    s_add_i32 s34, s0, 0xc0
-; GFX11-NEXT:    s_add_i32 s35, s0, 0xb0
-; GFX11-NEXT:    s_add_i32 s36, s0, 0xa0
-; GFX11-NEXT:    s_add_i32 s37, s0, 0x90
-; GFX11-NEXT:    s_add_i32 s38, s0, 0x80
-; GFX11-NEXT:    s_add_i32 s39, s0, 0x70
-; GFX11-NEXT:    s_add_i32 s40, s0, 0x60
-; GFX11-NEXT:    s_add_i32 s41, s0, 0x50
-; GFX11-NEXT:    s_add_i32 s42, s0, 64
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x100
+; GFX11-NEXT:    s_add_i32 s3, s0, 0xf0
+; GFX11-NEXT:    s_add_i32 s34, s0, 0xe0
+; GFX11-NEXT:    s_add_i32 s35, s0, 0xd0
+; GFX11-NEXT:    s_add_i32 s36, s0, 0xc0
+; GFX11-NEXT:    s_add_i32 s37, s0, 0xb0
+; GFX11-NEXT:    s_add_i32 s38, s0, 0xa0
+; GFX11-NEXT:    s_add_i32 s39, s0, 0x90
+; GFX11-NEXT:    s_add_i32 s40, s0, 0x70
+; GFX11-NEXT:    s_add_i32 s41, s0, 0x60
+; GFX11-NEXT:    s_add_i32 s42, s0, 0x50
 ; GFX11-NEXT:    s_add_i32 s43, s0, 48
-; GFX11-NEXT:    s_add_i32 s44, s0, 32
-; GFX11-NEXT:    s_waitcnt vmcnt(23)
-; GFX11-NEXT:    scratch_store_b128 off, v[45:48], s1
-; GFX11-NEXT:    s_add_i32 s1, s0, 0x100
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:108
-; GFX11-NEXT:    s_waitcnt vmcnt(21)
-; GFX11-NEXT:    scratch_store_b128 off, v[56:59], s1
-; GFX11-NEXT:    s_clause 0xc
-; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_b32 v1, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_b32 v4, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:4
-; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s32 offset:224
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    s_add_i32 s1, s0, 0xf0
-; GFX11-NEXT:    s_add_i32 s0, s0, 16
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s32 offset:224 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
-; GFX11-NEXT:    scratch_store_b128 off, v[59:62], s2
-; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s3
-; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s34
-; GFX11-NEXT:    scratch_store_b128 off, v[37:40], s35
-; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s36
-; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s37
-; GFX11-NEXT:    scratch_store_b128 off, v[33:36], s38
-; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s32 offset:224 ; 16-byte Folded Reload
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s39
-; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s32 offset:240 ; 16-byte Folded Reload
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s0 offset:128
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s1
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s2
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    scratch_store_b128 off, v[60:63], s3
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    scratch_store_b128 off, v[56:59], s34
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s35
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    scratch_store_b128 off, v[37:40], s36
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s37
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s38
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    scratch_store_b128 off, v[33:36], s39
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s40
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s41
-; GFX11-NEXT:    scratch_store_b128 off, v[20:23], s42
-; GFX11-NEXT:    scratch_store_b128 off, v[16:19], s43
-; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s44
-; GFX11-NEXT:    scratch_store_b128 off, v[8:11], s0
-; GFX11-NEXT:    s_clause 0xe
-; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:164
-; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:168
-; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:172
-; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:176
-; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:180
-; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:184
-; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:188
-; GFX11-NEXT:    scratch_load_b32 v47, off, s32 offset:192
-; GFX11-NEXT:    scratch_load_b32 v46, off, s32 offset:196
-; GFX11-NEXT:    scratch_load_b32 v45, off, s32 offset:200
-; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:204
-; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:208
-; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:212
-; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:216
-; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:220
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s40
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s41
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s42
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s43
+; GFX11-NEXT:    s_clause 0xc
+; GFX11-NEXT:    scratch_load_b32 v63, off, s32 offset:164
+; GFX11-NEXT:    scratch_load_b32 v62, off, s32 offset:168
+; GFX11-NEXT:    scratch_load_b32 v61, off, s32 offset:172
+; GFX11-NEXT:    scratch_load_b32 v60, off, s32 offset:176
+; GFX11-NEXT:    scratch_load_b32 v59, off, s32 offset:180
+; GFX11-NEXT:    scratch_load_b32 v58, off, s32 offset:184
+; GFX11-NEXT:    scratch_load_b32 v57, off, s32 offset:188
+; GFX11-NEXT:    scratch_load_b32 v56, off, s32 offset:192
+; GFX11-NEXT:    scratch_load_b32 v44, off, s32 offset:196
+; GFX11-NEXT:    scratch_load_b32 v43, off, s32 offset:200
+; GFX11-NEXT:    scratch_load_b32 v42, off, s32 offset:204
+; GFX11-NEXT:    scratch_load_b32 v41, off, s32 offset:208
+; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:212
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   ret <72 x i32> %val
@@ -3334,12 +3308,12 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-LABEL: call_72xi32:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s45, s33
+; GFX11-NEXT:    s_mov_b32 s46, s33
 ; GFX11-NEXT:    s_add_i32 s33, s32, 0x1ff
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_and_b32 s33, s33, 0xfffffe00
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_store_b32 off, v32, s33 offset:1536 ; 4-byte Folded Spill
+; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v4, 0
@@ -3349,8 +3323,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
 ; GFX11-NEXT:    s_addk_i32 s32, 0xa00
-; GFX11-NEXT:    s_clause 0xf
-; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:60
+; GFX11-NEXT:    s_clause 0xe
 ; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:56
 ; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:52
 ; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:48
@@ -3388,7 +3361,7 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1
 ; GFX11-NEXT:    s_add_i32 s0, s33, 0x200
-; GFX11-NEXT:    v_writelane_b32 v32, s30, 0
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0
@@ -3405,110 +3378,114 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v26, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v28, 0
 ; GFX11-NEXT:    v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0
-; GFX11-NEXT:    s_mov_b32 s47, return_72xi32 at abs32@hi
-; GFX11-NEXT:    s_mov_b32 s46, return_72xi32 at abs32@lo
-; GFX11-NEXT:    v_writelane_b32 v32, s31, 1
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[46:47]
-; GFX11-NEXT:    s_clause 0xb
-; GFX11-NEXT:    scratch_load_b128 v[33:36], off, s33 offset:624
-; GFX11-NEXT:    scratch_load_b128 v[26:29], off, s33 offset:640
+; GFX11-NEXT:    s_mov_b32 s45, return_72xi32 at abs32@hi
+; GFX11-NEXT:    s_mov_b32 s44, return_72xi32 at abs32@lo
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b128 v[45:48], off, s33 offset:624
+; GFX11-NEXT:    scratch_load_b128 v[33:36], off, s33 offset:640
+; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_mov_b32_e32 v32, v48
+; GFX11-NEXT:    s_clause 0x9
 ; GFX11-NEXT:    scratch_load_b128 v[48:51], off, s33 offset:656
 ; GFX11-NEXT:    scratch_load_b128 v[52:55], off, s33 offset:672
-; GFX11-NEXT:    scratch_load_b128 v[40:43], off, s33 offset:688
-; GFX11-NEXT:    scratch_load_b128 v[44:47], off, s33 offset:704
-; GFX11-NEXT:    scratch_load_b128 v[56:59], off, s33 offset:720
-; GFX11-NEXT:    scratch_load_b128 v[60:63], off, s33 offset:736
+; GFX11-NEXT:    scratch_load_b128 v[41:44], off, s33 offset:688
+; GFX11-NEXT:    scratch_load_b128 v[56:59], off, s33 offset:704
+; GFX11-NEXT:    scratch_load_b128 v[60:63], off, s33 offset:720
+; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:736
 ; GFX11-NEXT:    scratch_load_b128 v[0:3], off, s33 offset:752
 ; GFX11-NEXT:    scratch_load_b128 v[4:7], off, s33 offset:768
 ; GFX11-NEXT:    scratch_load_b128 v[8:11], off, s33 offset:784
-; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:512
-; GFX11-NEXT:    s_add_i32 s0, s32, 0xa0
-; GFX11-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-NEXT:    v_dual_mov_b32 v31, v50 :: v_dual_mov_b32 v30, v49
+; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:512
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    v_dual_mov_b32 v38, v53 :: v_dual_mov_b32 v37, v52
 ; GFX11-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-NEXT:    v_dual_mov_b32 v49, v40 :: v_dual_mov_b32 v50, v41
-; GFX11-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-NEXT:    v_dual_mov_b32 v41, v56 :: v_dual_mov_b32 v40, v47
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_dual_mov_b32 v47, v2 :: v_dual_mov_b32 v2, v5
-; GFX11-NEXT:    v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v27
+; GFX11-NEXT:    v_dual_mov_b32 v39, v54 :: v_dual_mov_b32 v52, v44
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    v_dual_mov_b32 v53, v56 :: v_dual_mov_b32 v54, v57
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v12
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s33 offset:1588 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b128 off, v[16:19], s33 offset:1588 ; 16-byte Folded Spill
 ; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b128 v[12:15], off, s33 offset:528
-; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:544
-; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:560
-; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:576
-; GFX11-NEXT:    v_dual_mov_b32 v39, v28 :: v_dual_mov_b32 v28, v29
-; GFX11-NEXT:    v_dual_mov_b32 v29, v48 :: v_dual_mov_b32 v48, v55
-; GFX11-NEXT:    v_dual_mov_b32 v55, v46 :: v_dual_mov_b32 v46, v1
-; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v4, v7
-; GFX11-NEXT:    v_mov_b32_e32 v5, v8
-; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v56, v59
+; GFX11-NEXT:    scratch_load_b128 v[16:19], off, s33 offset:528
+; GFX11-NEXT:    scratch_load_b128 v[20:23], off, s33 offset:544
+; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:560
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:576
+; GFX11-NEXT:    v_mov_b32_e32 v56, v63
+; GFX11-NEXT:    v_mov_b32_e32 v12, v15
+; GFX11-NEXT:    v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v15, v2
+; GFX11-NEXT:    v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
+; GFX11-NEXT:    v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
 ; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_mov_b32_e32 v8, v15
-; GFX11-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-NEXT:    v_dual_mov_b32 v10, v17 :: v_dual_mov_b32 v15, v22
+; GFX11-NEXT:    v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v8, v19
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    v_mov_b32_e32 v10, v21
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:1572 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:592
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:592
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:1556 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_load_b128 v[24:27], off, s33 offset:608
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_load_b128 v[28:31], off, s33 offset:608
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b128 off, v[24:27], s33 offset:1540 ; 16-byte Folded Spill
-; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s32
-; GFX11-NEXT:    v_dual_mov_b32 v37, v52 :: v_dual_mov_b32 v38, v53
-; GFX11-NEXT:    v_mov_b32_e32 v39, v54
-; GFX11-NEXT:    v_dual_mov_b32 v53, v44 :: v_dual_mov_b32 v54, v45
-; GFX11-NEXT:    v_dual_mov_b32 v44, v63 :: v_dual_mov_b32 v45, v0
-; GFX11-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v3, v6
-; GFX11-NEXT:    v_mov_b32_e32 v6, v9
+; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill
+; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s32
+; GFX11-NEXT:    v_mov_b32_e32 v32, v36
+; GFX11-NEXT:    v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49
+; GFX11-NEXT:    v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v36, v51
+; GFX11-NEXT:    v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v41
+; GFX11-NEXT:    v_mov_b32_e32 v50, v42
+; GFX11-NEXT:    v_dual_mov_b32 v55, v58 :: v_dual_mov_b32 v58, v13
+; GFX11-NEXT:    v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v0, v3
+; GFX11-NEXT:    v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9
 ; GFX11-NEXT:    scratch_store_b32 off, v11, s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x90
-; GFX11-NEXT:    v_dual_mov_b32 v36, v51 :: v_dual_mov_b32 v51, v42
-; GFX11-NEXT:    v_mov_b32_e32 v52, v43
+; GFX11-NEXT:    v_mov_b32_e32 v51, v43
+; GFX11-NEXT:    v_mov_b32_e32 v41, v59
 ; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
+; GFX11-NEXT:    v_mov_b32_e32 v7, v18
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x80
-; GFX11-NEXT:    v_mov_b32_e32 v42, v57
+; GFX11-NEXT:    v_dual_mov_b32 v42, v60 :: v_dual_mov_b32 v43, v61
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
-; GFX11-NEXT:    v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v5, v12
+; GFX11-NEXT:    v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v9, v20
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x70
-; GFX11-NEXT:    v_mov_b32_e32 v43, v58
-; GFX11-NEXT:    v_dual_mov_b32 v57, v60 :: v_dual_mov_b32 v58, v61
-; GFX11-NEXT:    scratch_store_b128 off, v[44:47], s0
+; GFX11-NEXT:    v_mov_b32_e32 v5, v16
+; GFX11-NEXT:    scratch_store_b128 off, v[12:15], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x6c
-; GFX11-NEXT:    v_dual_mov_b32 v6, v13 :: v_dual_mov_b32 v7, v14
+; GFX11-NEXT:    v_dual_mov_b32 v6, v17 :: v_dual_mov_b32 v11, v22
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x60
-; GFX11-NEXT:    v_mov_b32_e32 v9, v16
+; GFX11-NEXT:    v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45
 ; GFX11-NEXT:    scratch_store_b96 off, v[56:58], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x50
-; GFX11-NEXT:    v_mov_b32_e32 v11, v18
-; GFX11-NEXT:    scratch_store_b128 off, v[40:43], s0
+; GFX11-NEXT:    v_mov_b32_e32 v13, v24
+; GFX11-NEXT:    scratch_store_b128 off, v[41:44], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 64
-; GFX11-NEXT:    v_dual_mov_b32 v12, v19 :: v_dual_mov_b32 v13, v20
+; GFX11-NEXT:    v_dual_mov_b32 v14, v25 :: v_dual_mov_b32 v31, v47
 ; GFX11-NEXT:    scratch_store_b128 off, v[52:55], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 48
-; GFX11-NEXT:    v_mov_b32_e32 v14, v21
+; GFX11-NEXT:    v_mov_b32_e32 v15, v26
 ; GFX11-NEXT:    scratch_store_b128 off, v[48:51], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 32
-; GFX11-NEXT:    v_mov_b32_e32 v16, v23
+; GFX11-NEXT:    v_mov_b32_e32 v16, v27
 ; GFX11-NEXT:    scratch_store_b128 off, v[36:39], s0
 ; GFX11-NEXT:    s_add_i32 s0, s32, 16
-; GFX11-NEXT:    scratch_store_b128 off, v[28:31], s0
-; GFX11-NEXT:    v_mov_b32_e32 v29, v33
-; GFX11-NEXT:    s_clause 0x3
-; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1588
+; GFX11-NEXT:    v_mov_b32_e32 v30, v46
+; GFX11-NEXT:    scratch_store_b128 off, v[32:35], s0
+; GFX11-NEXT:    scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, 42
+; GFX11-NEXT:    s_clause 0x2
 ; GFX11-NEXT:    scratch_load_b128 v[17:20], off, s33 offset:1572
 ; GFX11-NEXT:    scratch_load_b128 v[21:24], off, s33 offset:1556
 ; GFX11-NEXT:    scratch_load_b128 v[25:28], off, s33 offset:1540
 ; GFX11-NEXT:    s_add_i32 s0, s33, 0x400
-; GFX11-NEXT:    v_dual_mov_b32 v30, v34 :: v_dual_mov_b32 v31, v35
-; GFX11-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 42
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[46:47]
-; GFX11-NEXT:    s_clause 0xf
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[44:45]
+; GFX11-NEXT:    s_clause 0xe
 ; GFX11-NEXT:    scratch_load_b32 v63, off, s33
 ; GFX11-NEXT:    scratch_load_b32 v62, off, s33 offset:4
 ; GFX11-NEXT:    scratch_load_b32 v61, off, s33 offset:8
@@ -3524,14 +3501,13 @@ define amdgpu_gfx void @call_72xi32() #1 {
 ; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:48
 ; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:52
 ; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:56
-; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:60
-; GFX11-NEXT:    v_readlane_b32 s31, v32, 1
-; GFX11-NEXT:    v_readlane_b32 s30, v32, 0
-; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX11-NEXT:    scratch_load_b32 v32, off, s33 offset:1536 ; 4-byte Folded Reload
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload
 ; GFX11-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX11-NEXT:    s_addk_i32 s32, 0xf600
-; GFX11-NEXT:    s_mov_b32 s33, s45
+; GFX11-NEXT:    s_mov_b32 s33, s46
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 6f419ab2cc67eba..133b8ec6a34d071 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -208,26 +208,20 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr
 ; GCN-SCRATCH-NEXT:    v_lshlrev_b32_e32 v2, 4, v31
 ; GCN-SCRATCH-NEXT:    v_and_b32_e32 v18, 0x3ff0, v2
 ; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v0, v0, v18
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v6, 16, v0
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v10, 32, v0
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v14, 48, v0
 ; GCN-SCRATCH-NEXT:    s_clause 0x3
 ; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[2:5], v0, off
-; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[6:9], v6, off
-; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[10:13], v10, off
-; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[14:17], v14, off
+; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[6:9], v0, off offset:16
+; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[10:13], v0, off offset:32
+; GCN-SCRATCH-NEXT:    scratch_load_dwordx4 v[14:17], v0, off offset:48
 ; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v0, v1, v18
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v1, 16, v0
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v18, 32, v0
-; GCN-SCRATCH-NEXT:    v_add_nc_u32_e32 v19, 48, v0
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[2:5], off
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(2)
-; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v1, v[6:9], off
+; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[6:9], off offset:16
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
-; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v18, v[10:13], off
+; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[10:13], off offset:32
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v19, v[14:17], off
+; GCN-SCRATCH-NEXT:    scratch_store_dwordx4 v0, v[14:17], off offset:48
 ; GCN-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()

>From b6c8367c6a6dc31a57f1e4d7f75a959589006ae4 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Wed, 1 Nov 2023 11:21:12 +0800
Subject: [PATCH 2/7] Further update the change:

1. Add GlobalISel path change.
2. Make the isFlatScratchBaseLegal() accept two `base`s, thus can handle
   the `sgpr+vgpr+imm` case in one place.
3. Fix one issue the previous change mis-handles the
   store_load_i32_large_negative_unaligned test, which has large
   negative offset that cannot be put into register.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  39 ++--
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |   2 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  46 +++-
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |   3 +-
 .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 202 ++++++++----------
 .../GlobalISel/inst-select-load-private.mir   |  72 +++++--
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      |   6 +-
 7 files changed, 216 insertions(+), 154 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 691d644badd24b5..42f507369642daf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1146,26 +1146,35 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
   return CurDAG->SignBitIsZero(Base);
 }
 
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr, SDValue Base,
+// This is used to check whether the address of flat scratch load/store in the
+// form of `base1 + (base2) + immediate offset` is legal with respect to the
+// hardware's requirement that the SGPR/VGPR address offset should be unsigned.
+// The `base` parts are those that would go into SGPR or VGPR.
+// When value in 32-bit `base` can be negative calculate scratch offset using
+// 32-bit add instruction, otherwise use `base(unsigned) + offset`.
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue FullAddr, SDValue Base1,
+                                                SDValue Base2,
                                                 uint64_t FlatVariant) const {
   if (FlatVariant != SIInstrFlags::FlatScratch)
     return true;
 
-  if (Addr.getOpcode() == ISD::ADD) {
+  assert(Base1.getNode());
+  if (FullAddr.getOpcode() == ISD::ADD && !Base2.getNode()) {
     // For `nuw` addition, we should not have negative base address.
-    if (Addr->getFlags().hasNoUnsignedWrap())
+    if (FullAddr->getFlags().hasNoUnsignedWrap())
       return true;
 
-    auto *RHS = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
-    // If the immediate offset is negative, we should not have the base being
-    // negative as well.
-    if (RHS && RHS->getSExtValue() < 0)
+    assert(isa<ConstantSDNode>(FullAddr->getOperand(1)));
+    // If the immediate offset is negative, the base address cannot also be
+    // negative.
+    if (CurDAG->computeKnownBits(FullAddr.getOperand(1)).isNegative())
       return true;
   }
 
-  // When value in 32-bit Base can be negative calculate scratch offset using
-  // 32-bit add instruction, otherwise use Base(unsigned) + offset.
-  return CurDAG->SignBitIsZero(Base);
+  if (!Base2.getNode())
+    return CurDAG->SignBitIsZero(Base1);
+
+  return CurDAG->SignBitIsZero(Base1) && CurDAG->SignBitIsZero(Base2);
 }
 
 // TODO: If offset is too big, put low 16-bit into offset.
@@ -1562,7 +1571,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
     SDValue N0, N1;
     if (isBaseWithConstantOffset64(Addr, N0, N1) &&
-        isFlatScratchBaseLegal(Addr, N0, FlatVariant)) {
+        isFlatScratchBaseLegal(Addr, N0, SDValue(), FlatVariant)) {
       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
       const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1795,7 +1804,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
   int64_t COffsetVal = 0;
 
   if (CurDAG->isBaseWithConstantOffset(Addr) &&
-      isFlatScratchBaseLegal(Addr, Addr.getOperand(0))) {
+      isFlatScratchBaseLegal(Addr, Addr.getOperand(0), SDValue())) {
     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
     SAddr = Addr.getOperand(0);
   } else {
@@ -1852,6 +1861,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
   int64_t ImmOffset = 0;
 
   SDValue LHS, RHS;
+  SDValue FullAddr = Addr;
   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
     const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1873,7 +1883,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
         VAddr = SDValue(VMov, 0);
         SAddr = LHS;
-        if (!isFlatScratchBaseLegal(Addr, SAddr))
+        if (!isFlatScratchBaseLegal(Addr, SAddr, SDValue()))
           return false;
         if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
           return false;
@@ -1899,8 +1909,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
     return false;
   }
 
-  if (!isFlatScratchBaseLegal(Addr, SAddr) ||
-      !isFlatScratchBaseLegal(Addr, VAddr))
+  if (!isFlatScratchBaseLegal(FullAddr, SAddr, VAddr))
     return false;
 
   if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 8a47757f70bbfbc..15cbd26ee2bcfae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -155,7 +155,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
                         unsigned Size) const;
   bool isFlatScratchBaseLegal(
-      SDValue Addr, SDValue Base,
+      SDValue FullAddr, SDValue Base1, SDValue Base2,
       uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
 
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 31d72fb8cadd8a6..46415eae09d46db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4095,7 +4095,11 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
   int64_t ConstOffset;
   std::tie(PtrBase, ConstOffset) =
       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
-  if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant))
+
+  auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
+  if (ConstOffset == 0 ||
+      !isFlatScratchBaseLegal(*AddrDef->MI, PtrBase, AMDGPU::NoRegister,
+                              FlatVariant))
     return Default;
 
   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
@@ -4257,15 +4261,17 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   // Match the immediate offset first, which canonically is moved as low as
   // possible.
   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+  auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
 
-  if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) &&
+  if (ConstOffset != 0 &&
+      isFlatScratchBaseLegal(*AddrDef->MI, PtrBase, AMDGPU::NoRegister) &&
       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
                             SIInstrFlags::FlatScratch)) {
     Addr = PtrBase;
     ImmOffset = ConstOffset;
+    AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
   }
 
-  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
   if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
     int FI = AddrDef->MI->getOperand(1).getIndex();
     return {{
@@ -4335,6 +4341,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
   // possible.
   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
 
+  Register FullAddr = Addr;
   if (ConstOffset != 0 &&
       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
     Addr = PtrBase;
@@ -4352,7 +4359,8 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
   Register LHS = AddrDef->MI->getOperand(1).getReg();
   auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
 
-  if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS))
+  auto FullAddrDef = getDefSrcRegIgnoringCopies(FullAddr, *MRI);
+  if (!isFlatScratchBaseLegal(*FullAddrDef->MI, LHS, RHS))
     return std::nullopt;
 
   if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
@@ -4486,14 +4494,36 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
   return KB->signBitIsZero(Base);
 }
 
+// This is used to check whether the address of flat scratch load/store in the
+// form of `base1 + (base2) + immediate offset` is legal with respect to the
+// hardware's requirement that the SGPR/VGPR address offset should be unsigned.
+// The `base` parts are those that would go into SGPR or VGPR.
+// When value in 32-bit `base` can be negative calculate scratch offset using
+// 32-bit add instruction, otherwise use `base(unsigned) + offset`.
 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
-    Register Base, uint64_t FlatVariant) const {
+    MachineInstr &AddrDef, Register Base1, Register Base2,
+    uint64_t FlatVariant) const {
   if (FlatVariant != SIInstrFlags::FlatScratch)
     return true;
 
-  // When value in 32-bit Base can be negative calculate scratch offset using
-  // 32-bit add instruction, otherwise use Base(unsigned) + offset.
-  return KB->signBitIsZero(Base);
+  if (AddrDef.getOpcode() == TargetOpcode::G_PTR_ADD &&
+      Base2 == AMDGPU::NoRegister) {
+    // For `nuw` addition, we should not have negative base address.
+    if (AddrDef.getFlag(MachineInstr::NoUWrap))
+      return true;
+
+    MachineOperand &OffsetOp = AddrDef.getOperand(2);
+    assert(getIConstantVRegValWithLookThrough(OffsetOp.getReg(), *MRI));
+    // If the immediate offset is negative, the base address cannot also be
+    // negative.
+    if (KB->getKnownOnes(OffsetOp.getReg()).isNegative())
+      return true;
+  }
+
+  if (Base2 == AMDGPU::NoRegister)
+    return KB->signBitIsZero(Base1);
+
+  return KB->signBitIsZero(Base1) && KB->signBitIsZero(Base2);
 }
 
 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 93e45fcd8682f07..cbc194f27c19a84 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -244,7 +244,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
                         unsigned Size) const;
   bool isFlatScratchBaseLegal(
-      Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+      MachineInstr &AddrDef, Register Base1, Register Base2,
+      uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
 
   std::pair<Register, unsigned>
   selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 45df3bc094f351e..ec2cd43e5fb5df3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -89,17 +89,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
 ; GFX9-NEXT:    v_add_u32_e32 v1, 4, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
-; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 4
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -111,42 +109,39 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7c
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT:    v_add3_u32 v1, 4, v1, v2
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_vindex_kernel:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 4
-; GFX940-NEXT:    scratch_store_dword v1, v3, off offset:4 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX940-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v1, v2, off offset:4 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
-; GFX11-NEXT:    v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX11-NEXT:    scratch_store_b32 v0, v3, off offset:4 dlc
+; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_add3_u32 v1, 4, v1, v2
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 4, v1
+; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 bb:
@@ -233,34 +228,31 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
 ; GFX9-LABEL: private_ptr_foo:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: private_ptr_foo:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: private_ptr_foo:
 ; GFX940:       ; %bb.0:
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: private_ptr_foo:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
-; GFX11-NEXT:    scratch_store_b32 v0, v1, off
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
   store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -366,16 +358,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-NEXT:    scratch_load_dword v1, off, s0 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x104
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x104, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x104, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -387,16 +377,15 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7c
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x104, v0
-; GFX10-NEXT:    v_add3_u32 v1, 0x104, v1, v2
-; GFX10-NEXT:    scratch_load_dword v2, off, off offset:4 glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x104, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -405,30 +394,27 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x104
-; GFX940-NEXT:    scratch_store_dword v1, v3, off offset:260 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX940-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v1, v2, off offset:260 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_add_u32_e32 v0, 0x104, v0
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_small_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
-; GFX11-NEXT:    v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX11-NEXT:    v_add3_u32 v1, 0x104, v1, v2
-; GFX11-NEXT:    scratch_load_b32 v2, off, off offset:4 glc dlc
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 v0, v3, off offset:260 dlc
+; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:260 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x104, v1
+; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 bb:
@@ -631,16 +617,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-NEXT:    scratch_load_dword v1, off, s0 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4004
-; GFX9-NEXT:    scratch_store_dword v1, v3, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    scratch_store_dword v1, v2, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 0x4004, v0
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -652,16 +636,15 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7c
-; GFX10-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-NEXT:    v_mov_b32_e32 v2, 15
+; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
-; GFX10-NEXT:    v_add3_u32 v1, 0x4004, v1, v2
-; GFX10-NEXT:    scratch_load_dword v2, off, off offset:4 glc dlc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    scratch_store_dword v0, v3, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v1
+; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -670,32 +653,29 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v3, 15
-; GFX940-NEXT:    s_movk_i32 s0, 0x4004
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x4004
-; GFX940-NEXT:    scratch_store_dword v1, v3, s0 sc0 sc1
-; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v2, 15
+; GFX940-NEXT:    s_movk_i32 s0, 0x4004
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT:    v_mov_b32_e32 v1, 0x7c
-; GFX940-NEXT:    v_add3_u32 v0, v2, v0, v1
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v1, v2, s0 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_add_u32_e32 v0, 0x4004, v0
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_large_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
-; GFX11-NEXT:    v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    s_movk_i32 s0, 0x4004
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX11-NEXT:    v_add3_u32 v1, 0x4004, v1, v2
-; GFX11-NEXT:    scratch_load_b32 v2, off, off offset:4 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 v0, v3, s0 dlc
+; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX11-NEXT:    scratch_store_b32 v0, v2, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v1
+; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 bb:
@@ -945,16 +925,14 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9:       ; %bb.0: ; %bb
 ; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
-; GFX9-NEXT:    v_mov_b32_e32 v1, 4
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x400
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GFX9-NEXT:    v_add3_u32 v0, v1, v0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, 15
-; GFX9-NEXT:    scratch_store_dword v0, v1, off
+; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
+; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -965,44 +943,40 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x400
+; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GFX10-NEXT:    v_add3_u32 v0, 4, v0, v1
-; GFX10-NEXT:    v_mov_b32_e32 v1, 15
-; GFX10-NEXT:    scratch_store_dword v0, v1, off
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    scratch_load_dword v0, v0, off glc dlc
+; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_vidx_sidx_offset:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX940-NEXT:    v_mov_b32_e32 v1, 4
-; GFX940-NEXT:    v_mov_b32_e32 v2, 0x400
+; GFX940-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GFX940-NEXT:    v_add3_u32 v0, v1, v0, v2
-; GFX940-NEXT:    v_mov_b32_e32 v1, 15
-; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT:    v_add_u32_e32 v0, 4, v0
+; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1024 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1024 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vidx_sidx_offset:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX11-NEXT:    v_mov_b32_e32 v1, 0x400
+; GFX11-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v0, 4, v0, v1
-; GFX11-NEXT:    v_mov_b32_e32 v1, 15
-; GFX11-NEXT:    scratch_store_b32 v0, v1, off dlc
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 4, v0
+; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:1024 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v0, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1024 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
index 4bf6b6a623d8269..4c4014de4348e12 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
@@ -25,12 +25,14 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_4
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_4
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -63,12 +65,14 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_2
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s16), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_2
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -101,12 +105,14 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -139,12 +145,14 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_p3_from_4
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (p3), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_p3_from_4
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -177,12 +185,14 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (p5), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_p5_from_4
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (p5), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_p5_from_4
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -216,12 +226,14 @@ body: |
     ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_v2s16
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (<2 x s16>), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_v2s16
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -260,12 +272,14 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_2047
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -304,6 +318,7 @@ body: |
     ; GFX6-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047_known_bits
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -312,6 +327,7 @@ body: |
     ; GFX9-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_2047_known_bits
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -353,12 +369,14 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2048
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_2048
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -397,6 +415,7 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2047
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -405,13 +424,12 @@ body: |
     ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_m2047
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2047, implicit $exec
-    ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2047, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -2047
@@ -443,6 +461,7 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2048
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -451,13 +470,12 @@ body: |
     ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_m2048
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -2048, implicit $exec
-    ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -2048, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -2048
@@ -489,12 +507,14 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_4095
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_4095
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -533,6 +553,7 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_4096
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -541,6 +562,7 @@ body: |
     ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_4096
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -579,6 +601,7 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4095
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -587,13 +610,12 @@ body: |
     ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_m4095
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4095, implicit $exec
-    ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -4095
@@ -625,6 +647,7 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4096
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -633,13 +656,12 @@ body: |
     ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_m4096
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -4096, implicit $exec
-    ; GFX11-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_ADD_U32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
+    ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[COPY]], -4096, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -4096
@@ -671,6 +693,7 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_8191
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -679,6 +702,7 @@ body: |
     ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_8191
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -717,6 +741,7 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_8192
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -725,6 +750,7 @@ body: |
     ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_8192
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -763,6 +789,7 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8191
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -771,6 +798,7 @@ body: |
     ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_m8191
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -809,6 +837,7 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8192
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -817,6 +846,7 @@ body: |
     ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_gep_m8192
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
@@ -849,9 +879,11 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_4_constant_0
     ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_4_constant_0
     ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_4_constant_0
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
@@ -878,9 +910,11 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_4_constant_sgpr_16
     ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (load (s32), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_4_constant_sgpr_16
     ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, implicit $exec :: (load (s32), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_4_constant_sgpr_16
     ; GFX11: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 16
     ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR [[S_MOV_B32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
@@ -907,9 +941,11 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_1_constant_4095
     ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_constant_4095
     ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_constant_4095
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
@@ -937,10 +973,12 @@ body: |
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_constant_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_constant_4096
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
@@ -969,9 +1007,11 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_fi
     ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_fi
     ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_fi
     ; GFX11: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_DWORD_SADDR]]
@@ -998,9 +1038,11 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_1_fi_offset_4095
     ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4095
     ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_fi_offset_4095
     ; GFX11: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]]
@@ -1030,9 +1072,11 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_1_fi_offset_sgpr_4095
     ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_sgpr_4095
     ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_fi_offset_sgpr_4095
     ; GFX11: [[SCRATCH_LOAD_UBYTE_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SADDR %stack.0, 4095, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
     ; GFX11-NEXT: $vgpr0 = COPY [[SCRATCH_LOAD_UBYTE_SADDR]]
@@ -1066,12 +1110,14 @@ body: |
     ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_CO_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX9-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX9-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s8), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_1_fi_offset_4096
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_UBYTE_SVS:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_UBYTE_SVS [[V_MOV_B32_e32_]], %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8), addrspace 5)
@@ -1102,10 +1148,12 @@ body: |
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, implicit $exec :: (load (s32), addrspace 5)
     ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX9-LABEL: name: load_private_s32_from_neg1
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     ; GFX9-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, -1, 0, 0, implicit $exec :: (load (s32), addrspace 5)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
+    ;
     ; GFX11-LABEL: name: load_private_s32_from_neg1
     ; GFX11: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     ; GFX11-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32), addrspace 5)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index fe984ffa653a5ef..f1a2fe886775338 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -3726,11 +3726,11 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX940-LABEL: store_load_i32_large_negative_unaligned:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT:    s_movk_i32 s0, 0xef7f
+; GFX940-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v1, 1
-; GFX940-NEXT:    scratch_store_byte v0, v1, s0 sc0 sc1
+; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_ubyte v0, v0, s0 sc0 sc1
+; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;

>From a5b95f5e76511dd3f57df83a8cd8f47442db601f Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Wed, 1 Nov 2023 22:23:09 +0800
Subject: [PATCH 3/7] Handle pointer OR case

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 13 ++-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 15 ++--
 llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll  | 80 ++++++++-----------
 3 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 42f507369642daf..7d8ee2fb2db9841 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1159,7 +1159,12 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue FullAddr, SDValue Base1,
     return true;
 
   assert(Base1.getNode());
-  if (FullAddr.getOpcode() == ISD::ADD && !Base2.getNode()) {
+  // TODO: We could possibly improve this with more complex checking when we
+  // have both SGPR and VGPR.
+  if (Base2.getNode())
+    return CurDAG->SignBitIsZero(Base1) && CurDAG->SignBitIsZero(Base2);
+
+  if (FullAddr.getOpcode() == ISD::ADD) {
     // For `nuw` addition, we should not have negative base address.
     if (FullAddr->getFlags().hasNoUnsignedWrap())
       return true;
@@ -1171,10 +1176,10 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue FullAddr, SDValue Base1,
       return true;
   }
 
-  if (!Base2.getNode())
-    return CurDAG->SignBitIsZero(Base1);
+  if (FullAddr.getOpcode() == ISD::OR)
+    return true;
 
-  return CurDAG->SignBitIsZero(Base1) && CurDAG->SignBitIsZero(Base2);
+  return CurDAG->SignBitIsZero(Base1);
 }
 
 // TODO: If offset is too big, put low 16-bit into offset.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 46415eae09d46db..048aef09e9fec58 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -17,6 +17,7 @@
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPURegisterBankInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
@@ -4506,8 +4507,12 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
   if (FlatVariant != SIInstrFlags::FlatScratch)
     return true;
 
-  if (AddrDef.getOpcode() == TargetOpcode::G_PTR_ADD &&
-      Base2 == AMDGPU::NoRegister) {
+  // TODO: We could possibly improve this with more complex checking when we
+  // have both SGPR and VGPR.
+  if (Base2 != AMDGPU::NoRegister)
+    return KB->signBitIsZero(Base1) && KB->signBitIsZero(Base2);
+
+  if (AddrDef.getOpcode() == TargetOpcode::G_PTR_ADD) {
     // For `nuw` addition, we should not have negative base address.
     if (AddrDef.getFlag(MachineInstr::NoUWrap))
       return true;
@@ -4520,10 +4525,10 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
       return true;
   }
 
-  if (Base2 == AMDGPU::NoRegister)
-    return KB->signBitIsZero(Base1);
+  if (AddrDef.getOpcode() == TargetOpcode::G_OR)
+    return true;
 
-  return KB->signBitIsZero(Base1) && KB->signBitIsZero(Base2);
+  return KB->signBitIsZero(Base1);
 }
 
 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 2e01c0064ef7c1c..8d28c7845167a2f 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -401,12 +401,11 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 1, v1
-; GFX940-SDAG-NEXT:    v_or_b32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v3, 2, v0
-; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v2, off offset:1 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, 2, v0
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
@@ -446,14 +445,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v4, 2, v0
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 4, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v2, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v3, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -503,12 +501,11 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX940-SDAG-NEXT:    v_or_b32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v3, 2, v0
-; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v2, off offset:1 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, 2, v0
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
@@ -548,14 +545,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v4, 2, v0
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 4, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v2, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v3, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -702,12 +698,11 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 1, v1
-; GFX940-SDAG-NEXT:    v_or_b32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_add_u32_e32 v3, 2, v0
-; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v2, off offset:1 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, 2, v0
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 2
+; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
@@ -747,14 +742,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v4, 2, v0
+; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v5, 4, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v2, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v3, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -800,16 +794,14 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX940-SDAG-NEXT:    s_load_dword s0, s[0:1], 0x24
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-SDAG-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX940-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v1, s0, v1
 ; GFX940-SDAG-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX940-SDAG-NEXT:    v_or_b32_e32 v1, 1, v0
-; GFX940-SDAG-NEXT:    v_or_b32_e32 v3, 2, v0
-; GFX940-SDAG-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v2, off offset:1 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 2
-; GFX940-SDAG-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
+; GFX940-SDAG-NEXT:    scratch_store_byte v0, v3, off offset:2 sc0 sc1
 ; GFX940-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-SDAG-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-SDAG-NEXT:    v_mov_b32_e32 v1, 4
@@ -843,20 +835,18 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v4, 1, v0
-; GFX11-SDAG-NEXT:    v_or_b32_e32 v5, 2, v0
-; GFX11-SDAG-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v4, v1, off dlc
+; GFX11-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v3, 4, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v5, v2, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v3, v4, off dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;

>From e1791d477a3f549a68d9a0f521ae59648e62e09d Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Mon, 6 Nov 2023 15:59:41 +0800
Subject: [PATCH 4/7] Support checking two instructions and refine the code.

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 72 ++++++++++++-------
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |  3 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 63 +++++++++-------
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |  3 +-
 4 files changed, 86 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 7d8ee2fb2db9841..268a474cf7c0a1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1146,40 +1146,58 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
   return CurDAG->SignBitIsZero(Base);
 }
 
-// This is used to check whether the address of flat scratch load/store in the
-// form of `base1 + (base2) + immediate offset` is legal with respect to the
-// hardware's requirement that the SGPR/VGPR address offset should be unsigned.
-// The `base` parts are those that would go into SGPR or VGPR.
-// When value in 32-bit `base` can be negative calculate scratch offset using
-// 32-bit add instruction, otherwise use `base(unsigned) + offset`.
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue FullAddr, SDValue Base1,
-                                                SDValue Base2,
+// Check that the address value of flat scratch load/store being put into
+// SGPR/VGPR is legal with respect to hardware's requirement that address in
+// SGPR/VGPR should be unsigned. When \p CheckTwoInstrs is set, we will check
+// against the last two instructions which calculate \p FullAddr. When \p
+// CheckTwoOperands is set, we will check both operands (In case of two
+// instructions, they are the operands from the instruction before the last).
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue FullAddr,
+                                                bool CheckTwoInstrs,
+                                                bool CheckTwoOperands,
                                                 uint64_t FlatVariant) const {
   if (FlatVariant != SIInstrFlags::FlatScratch)
     return true;
 
-  assert(Base1.getNode());
-  // TODO: We could possibly improve this with more complex checking when we
-  // have both SGPR and VGPR.
-  if (Base2.getNode())
-    return CurDAG->SignBitIsZero(Base1) && CurDAG->SignBitIsZero(Base2);
+  // whether we can prove the operands are non-negative from operation.
+  auto HasOnlyNonNegativeOperands = [](SDValue Addr) -> bool {
+    return (Addr.getOpcode() == ISD::ADD &&
+            Addr->getFlags().hasNoUnsignedWrap()) ||
+           Addr->getOpcode() == ISD::OR;
+  };
 
-  if (FullAddr.getOpcode() == ISD::ADD) {
-    // For `nuw` addition, we should not have negative base address.
-    if (FullAddr->getFlags().hasNoUnsignedWrap())
+  if (CheckTwoInstrs) {
+    auto PartAddr = FullAddr.getOperand(0);
+    // Make sure we are doing SGPR + VGPR + Imm.
+    assert(isa<ConstantSDNode>(FullAddr.getOperand(1)));
+    if (HasOnlyNonNegativeOperands(FullAddr) &&
+        HasOnlyNonNegativeOperands(PartAddr))
       return true;
 
-    assert(isa<ConstantSDNode>(FullAddr->getOperand(1)));
-    // If the immediate offset is negative, the base address cannot also be
-    // negative.
-    if (CurDAG->computeKnownBits(FullAddr.getOperand(1)).isNegative())
-      return true;
+    auto LHS = PartAddr.getOperand(0);
+    auto RHS = PartAddr.getOperand(1);
+    return CurDAG->SignBitIsZero(LHS) && CurDAG->SignBitIsZero(RHS);
   }
 
-  if (FullAddr.getOpcode() == ISD::OR)
+  // Single instruction case
+  if (HasOnlyNonNegativeOperands(FullAddr))
     return true;
 
-  return CurDAG->SignBitIsZero(Base1);
+  auto LHS = FullAddr.getOperand(0);
+  auto RHS = FullAddr.getOperand(1);
+  if (CheckTwoOperands)
+    return CurDAG->SignBitIsZero(LHS) && CurDAG->SignBitIsZero(RHS);
+
+  // If the immediate offset is negative, the base address cannot also be
+  // negative.
+  ConstantSDNode *ImmOp = nullptr;
+  if (FullAddr.getOpcode() == ISD::ADD &&
+      (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
+    if (ImmOp->getSExtValue() < 0)
+      return true;
+  }
+
+  return CurDAG->SignBitIsZero(LHS);
 }
 
 // TODO: If offset is too big, put low 16-bit into offset.
@@ -1576,7 +1594,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
     SDValue N0, N1;
     if (isBaseWithConstantOffset64(Addr, N0, N1) &&
-        isFlatScratchBaseLegal(Addr, N0, SDValue(), FlatVariant)) {
+        isFlatScratchBaseLegal(Addr, false, false, FlatVariant)) {
       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
       const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1809,7 +1827,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
   int64_t COffsetVal = 0;
 
   if (CurDAG->isBaseWithConstantOffset(Addr) &&
-      isFlatScratchBaseLegal(Addr, Addr.getOperand(0), SDValue())) {
+      isFlatScratchBaseLegal(Addr)) {
     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
     SAddr = Addr.getOperand(0);
   } else {
@@ -1888,7 +1906,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
         VAddr = SDValue(VMov, 0);
         SAddr = LHS;
-        if (!isFlatScratchBaseLegal(Addr, SAddr, SDValue()))
+        if (!isFlatScratchBaseLegal(Addr))
           return false;
         if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
           return false;
@@ -1914,7 +1932,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
     return false;
   }
 
-  if (!isFlatScratchBaseLegal(FullAddr, SAddr, VAddr))
+  if (!isFlatScratchBaseLegal(FullAddr, FullAddr != Addr, true))
     return false;
 
   if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 15cbd26ee2bcfae..08f393ab9fae8d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -155,7 +155,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
                         unsigned Size) const;
   bool isFlatScratchBaseLegal(
-      SDValue FullAddr, SDValue Base1, SDValue Base2,
+      SDValue FullAddr, bool CheckTwoInstrs = false,
+      bool CheckTwoOperands = false,
       uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
 
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 048aef09e9fec58..a2972059de311c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4099,8 +4099,7 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
 
   auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
   if (ConstOffset == 0 ||
-      !isFlatScratchBaseLegal(*AddrDef->MI, PtrBase, AMDGPU::NoRegister,
-                              FlatVariant))
+      !isFlatScratchBaseLegal(*AddrDef->MI, nullptr, false, FlatVariant))
     return Default;
 
   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
@@ -4264,8 +4263,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
   auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
 
-  if (ConstOffset != 0 &&
-      isFlatScratchBaseLegal(*AddrDef->MI, PtrBase, AMDGPU::NoRegister) &&
+  if (ConstOffset != 0 && isFlatScratchBaseLegal(*AddrDef->MI) &&
       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
                             SIInstrFlags::FlatScratch)) {
     Addr = PtrBase;
@@ -4361,7 +4359,8 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
   auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
 
   auto FullAddrDef = getDefSrcRegIgnoringCopies(FullAddr, *MRI);
-  if (!isFlatScratchBaseLegal(*FullAddrDef->MI, LHS, RHS))
+  if (!isFlatScratchBaseLegal(*FullAddrDef->MI,
+                              FullAddr != Addr ? AddrDef->MI : nullptr, true))
     return std::nullopt;
 
   if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
@@ -4495,40 +4494,52 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
   return KB->signBitIsZero(Base);
 }
 
-// This is used to check whether the address of flat scratch load/store in the
-// form of `base1 + (base2) + immediate offset` is legal with respect to the
-// hardware's requirement that the SGPR/VGPR address offset should be unsigned.
-// The `base` parts are those that would go into SGPR or VGPR.
-// When value in 32-bit `base` can be negative calculate scratch offset using
-// 32-bit add instruction, otherwise use `base(unsigned) + offset`.
+// Check that the address value of flat scratch load/store being put into
+// SGPR/VGPR is legal with respect to hardware's requirement that address in
+// SGPR/VGPR should be unsigned. When \p PartAddr is set, we will check
+// against both instructions to be sure the address are non-negative. When
+// \p CheckTwoOperands is set, we will check both operands (In case of two
+// instructions, they are the operands from the instruction \p PartAddr).
 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
-    MachineInstr &AddrDef, Register Base1, Register Base2,
+    MachineInstr &FullAddr, MachineInstr *PartAddr, bool CheckTwoOperands,
     uint64_t FlatVariant) const {
   if (FlatVariant != SIInstrFlags::FlatScratch)
     return true;
 
-  // TODO: We could possibly improve this with more complex checking when we
-  // have both SGPR and VGPR.
-  if (Base2 != AMDGPU::NoRegister)
-    return KB->signBitIsZero(Base1) && KB->signBitIsZero(Base2);
+  // whether we can prove the operands are non-negative from operation.
+  auto HasOnlyNonNegativeOperands = [](MachineInstr *Addr) -> bool {
+    return Addr->getOpcode() == TargetOpcode::G_OR ||
+           (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
+            Addr->getFlag(MachineInstr::NoUWrap));
+  };
 
-  if (AddrDef.getOpcode() == TargetOpcode::G_PTR_ADD) {
-    // For `nuw` addition, we should not have negative base address.
-    if (AddrDef.getFlag(MachineInstr::NoUWrap))
+  if (PartAddr) {
+    if (HasOnlyNonNegativeOperands(&FullAddr) &&
+        HasOnlyNonNegativeOperands(PartAddr))
       return true;
+    Register LHS = PartAddr->getOperand(1).getReg();
+    Register RHS = PartAddr->getOperand(2).getReg();
+    return KB->signBitIsZero(LHS) && KB->signBitIsZero(RHS);
+  }
+
+  // Single instruction case
+  if (HasOnlyNonNegativeOperands(&FullAddr))
+    return true;
+
+  Register LHS = FullAddr.getOperand(1).getReg();
+  Register RHS = FullAddr.getOperand(2).getReg();
+  if (CheckTwoOperands)
+    return KB->signBitIsZero(LHS) && KB->signBitIsZero(RHS);
 
-    MachineOperand &OffsetOp = AddrDef.getOperand(2);
-    assert(getIConstantVRegValWithLookThrough(OffsetOp.getReg(), *MRI));
+  if (FullAddr.getOpcode() == TargetOpcode::G_PTR_ADD) {
+    auto RhsValReg = getIConstantVRegValWithLookThrough(RHS, *MRI);
     // If the immediate offset is negative, the base address cannot also be
     // negative.
-    if (KB->getKnownOnes(OffsetOp.getReg()).isNegative())
+    if (RhsValReg && RhsValReg->Value.getSExtValue() < 0)
       return true;
   }
 
-  if (AddrDef.getOpcode() == TargetOpcode::G_OR)
-    return true;
-
-  return KB->signBitIsZero(Base1);
+  return KB->signBitIsZero(LHS);
 }
 
 bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index cbc194f27c19a84..53e5fb995fc041e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -244,7 +244,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
                         unsigned Size) const;
   bool isFlatScratchBaseLegal(
-      MachineInstr &AddrDef, Register Base1, Register Base2,
+      MachineInstr &FullAddr, MachineInstr *PartAddr = nullptr,
+      bool CheckTwoOperands = false,
       uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
 
   std::pair<Register, unsigned>

>From cb0a19d7b06d37217db92f167381f01fef0488a7 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Mon, 6 Nov 2023 20:41:52 +0800
Subject: [PATCH 5/7] Fix clang-format issue

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 268a474cf7c0a1f..17f3ef2be9297a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1826,8 +1826,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
 
   int64_t COffsetVal = 0;
 
-  if (CurDAG->isBaseWithConstantOffset(Addr) &&
-      isFlatScratchBaseLegal(Addr)) {
+  if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
     SAddr = Addr.getOperand(0);
   } else {

>From e11a6b2250b724e5fe50d90c68fb64615a8f5650 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Wed, 8 Nov 2023 11:48:21 +0800
Subject: [PATCH 6/7] Address review comments

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 41 +++++++------
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |  3 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 60 +++++++++++--------
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |  3 +-
 4 files changed, 60 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 17f3ef2be9297a4..189ad2a190becac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1149,17 +1149,19 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
 // Check that the address value of flat scratch load/store being put into
 // SGPR/VGPR is legal with respect to hardware's requirement that address in
 // SGPR/VGPR should be unsigned. When \p CheckTwoInstrs is set, we will check
-// against the last two instructions which calculate \p FullAddr. When \p
-// CheckTwoOperands is set, we will check both operands (In case of two
-// instructions, they are the operands from the instruction before the last).
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue FullAddr,
+// against the instruction that defines \p Addr as well as the instruction that
+// defines the base address. When \p CheckTwoOperands is set, we will check both
+// operands (In case of two instructions, they are the operands from the
+// instruction that defines the base address).
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr,
                                                 bool CheckTwoInstrs,
                                                 bool CheckTwoOperands,
                                                 uint64_t FlatVariant) const {
   if (FlatVariant != SIInstrFlags::FlatScratch)
     return true;
 
-  // whether we can prove the operands are non-negative from operation.
+  // Whether we can infer the operands are non-negative if the result is
+  // non-negative.
   auto HasOnlyNonNegativeOperands = [](SDValue Addr) -> bool {
     return (Addr.getOpcode() == ISD::ADD &&
             Addr->getFlags().hasNoUnsignedWrap()) ||
@@ -1167,31 +1169,34 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue FullAddr,
   };
 
   if (CheckTwoInstrs) {
-    auto PartAddr = FullAddr.getOperand(0);
+    auto Base = Addr.getOperand(0);
     // Make sure we are doing SGPR + VGPR + Imm.
-    assert(isa<ConstantSDNode>(FullAddr.getOperand(1)));
-    if (HasOnlyNonNegativeOperands(FullAddr) &&
-        HasOnlyNonNegativeOperands(PartAddr))
+    assert(isa<ConstantSDNode>(Addr.getOperand(1)));
+    auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
+    if (HasOnlyNonNegativeOperands(Base) &&
+        (HasOnlyNonNegativeOperands(Addr) || RHSImm->getSExtValue() < 0))
       return true;
 
-    auto LHS = PartAddr.getOperand(0);
-    auto RHS = PartAddr.getOperand(1);
+    auto LHS = Base.getOperand(0);
+    auto RHS = Base.getOperand(1);
     return CurDAG->SignBitIsZero(LHS) && CurDAG->SignBitIsZero(RHS);
   }
 
   // Single instruction case
-  if (HasOnlyNonNegativeOperands(FullAddr))
+  if (HasOnlyNonNegativeOperands(Addr))
     return true;
 
-  auto LHS = FullAddr.getOperand(0);
-  auto RHS = FullAddr.getOperand(1);
+  auto LHS = Addr.getOperand(0);
+  auto RHS = Addr.getOperand(1);
   if (CheckTwoOperands)
     return CurDAG->SignBitIsZero(LHS) && CurDAG->SignBitIsZero(RHS);
 
   // If the immediate offset is negative, the base address cannot also be
-  // negative.
+  // negative. Although this is not strictly true in theory (in the case of
+  // wrap around), but given the range of legal immediate offset, this is true
+  // to get a final valid scratch address.
   ConstantSDNode *ImmOp = nullptr;
-  if (FullAddr.getOpcode() == ISD::ADD &&
+  if (Addr.getOpcode() == ISD::ADD &&
       (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
     if (ImmOp->getSExtValue() < 0)
       return true;
@@ -1883,7 +1888,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
   int64_t ImmOffset = 0;
 
   SDValue LHS, RHS;
-  SDValue FullAddr = Addr;
+  SDValue OrigAddr = Addr;
   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
     const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1931,7 +1936,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
     return false;
   }
 
-  if (!isFlatScratchBaseLegal(FullAddr, FullAddr != Addr, true))
+  if (!isFlatScratchBaseLegal(OrigAddr, OrigAddr != Addr, true))
     return false;
 
   if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 08f393ab9fae8d4..b8d82fc48d2b3bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -155,8 +155,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
                         unsigned Size) const;
   bool isFlatScratchBaseLegal(
-      SDValue FullAddr, bool CheckTwoInstrs = false,
-      bool CheckTwoOperands = false,
+      SDValue Addr, bool CheckTwoInstrs = false, bool CheckTwoOperands = false,
       uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
 
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a2972059de311c2..e3f22e50daac66d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4097,9 +4097,8 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
   std::tie(PtrBase, ConstOffset) =
       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
 
-  auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
   if (ConstOffset == 0 ||
-      !isFlatScratchBaseLegal(*AddrDef->MI, nullptr, false, FlatVariant))
+      !isFlatScratchBaseLegal(Root.getReg(), false, false, FlatVariant))
     return Default;
 
   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
@@ -4261,16 +4260,15 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   // Match the immediate offset first, which canonically is moved as low as
   // possible.
   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
-  auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
 
-  if (ConstOffset != 0 && isFlatScratchBaseLegal(*AddrDef->MI) &&
+  if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
                             SIInstrFlags::FlatScratch)) {
     Addr = PtrBase;
     ImmOffset = ConstOffset;
-    AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
   }
 
+  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
   if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
     int FI = AddrDef->MI->getOperand(1).getIndex();
     return {{
@@ -4340,7 +4338,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
   // possible.
   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
 
-  Register FullAddr = Addr;
+  Register OrigAddr = Addr;
   if (ConstOffset != 0 &&
       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
     Addr = PtrBase;
@@ -4358,9 +4356,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
   Register LHS = AddrDef->MI->getOperand(1).getReg();
   auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
 
-  auto FullAddrDef = getDefSrcRegIgnoringCopies(FullAddr, *MRI);
-  if (!isFlatScratchBaseLegal(*FullAddrDef->MI,
-                              FullAddr != Addr ? AddrDef->MI : nullptr, true))
+  if (!isFlatScratchBaseLegal(OrigAddr, OrigAddr != Addr, true))
     return std::nullopt;
 
   if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
@@ -4496,45 +4492,59 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
 
 // Check that the address value of flat scratch load/store being put into
 // SGPR/VGPR is legal with respect to hardware's requirement that address in
-// SGPR/VGPR should be unsigned. When \p PartAddr is set, we will check
-// against both instructions to be sure the address are non-negative. When
-// \p CheckTwoOperands is set, we will check both operands (In case of two
-// instructions, they are the operands from the instruction \p PartAddr).
+// SGPR/VGPR should be unsigned. When \p CheckTwoInstrs is set, we will check
+// against the instruction that defines \p Addr as well as the instruction that
+// defines the base address. When \p CheckTwoOperands is set, we will check both
+// operands (In case of two instructions, they are the operands from the
+// instruction that defines the base address).
 bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
-    MachineInstr &FullAddr, MachineInstr *PartAddr, bool CheckTwoOperands,
+    Register Addr, bool CheckTwoInstrs, bool CheckTwoOperands,
     uint64_t FlatVariant) const {
   if (FlatVariant != SIInstrFlags::FlatScratch)
     return true;
 
-  // whether we can prove the operands are non-negative from operation.
+  // Whether we can infer the operands are non-negative if the result is
+  // non-negative.
   auto HasOnlyNonNegativeOperands = [](MachineInstr *Addr) -> bool {
     return Addr->getOpcode() == TargetOpcode::G_OR ||
            (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
             Addr->getFlag(MachineInstr::NoUWrap));
   };
 
-  if (PartAddr) {
-    if (HasOnlyNonNegativeOperands(&FullAddr) &&
-        HasOnlyNonNegativeOperands(PartAddr))
+  auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
+  auto *AddrMI = AddrDef->MI;
+  if (CheckTwoInstrs) {
+    Register Base = AddrMI->getOperand(1).getReg();
+    auto BaseDef = getDefSrcRegIgnoringCopies(Base, *MRI);
+    auto RHSOffset = getIConstantVRegValWithLookThrough(
+        AddrMI->getOperand(2).getReg(), *MRI);
+    assert(RHSOffset);
+
+    if (HasOnlyNonNegativeOperands(BaseDef->MI) &&
+        (HasOnlyNonNegativeOperands(AddrMI) ||
+         RHSOffset->Value.getSExtValue() < 0))
       return true;
-    Register LHS = PartAddr->getOperand(1).getReg();
-    Register RHS = PartAddr->getOperand(2).getReg();
+
+    Register LHS = BaseDef->MI->getOperand(1).getReg();
+    Register RHS = BaseDef->MI->getOperand(2).getReg();
     return KB->signBitIsZero(LHS) && KB->signBitIsZero(RHS);
   }
 
   // Single instruction case
-  if (HasOnlyNonNegativeOperands(&FullAddr))
+  if (HasOnlyNonNegativeOperands(AddrMI))
     return true;
 
-  Register LHS = FullAddr.getOperand(1).getReg();
-  Register RHS = FullAddr.getOperand(2).getReg();
+  Register LHS = AddrMI->getOperand(1).getReg();
+  Register RHS = AddrMI->getOperand(2).getReg();
   if (CheckTwoOperands)
     return KB->signBitIsZero(LHS) && KB->signBitIsZero(RHS);
 
-  if (FullAddr.getOpcode() == TargetOpcode::G_PTR_ADD) {
+  if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
     auto RhsValReg = getIConstantVRegValWithLookThrough(RHS, *MRI);
     // If the immediate offset is negative, the base address cannot also be
-    // negative.
+    // negative. Although this is not strictly true in theory (in the case of
+    // wrap around), but given the range of legal immediate offset, this is true
+    // to get a final valid scratch address.
     if (RhsValReg && RhsValReg->Value.getSExtValue() < 0)
       return true;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 53e5fb995fc041e..7a5938860ce2b00 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -244,8 +244,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
                         unsigned Size) const;
   bool isFlatScratchBaseLegal(
-      MachineInstr &FullAddr, MachineInstr *PartAddr = nullptr,
-      bool CheckTwoOperands = false,
+      Register Addr, bool CheckTwoInstrs = false, bool CheckTwoOperands = false,
       uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
 
   std::pair<Register, unsigned>

>From 3f6b11fbbfc84362c7d380e6e62b8b72e42ced10 Mon Sep 17 00:00:00 2001
From: Ruiling Song <ruiling.song at amd.com>
Date: Wed, 8 Nov 2023 11:51:21 +0800
Subject: [PATCH 7/7] Fix clang-format issue

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 189ad2a190becac..180011f4e050458 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1196,8 +1196,7 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr,
   // wrap around), but given the range of legal immediate offset, this is true
   // to get a final valid scratch address.
   ConstantSDNode *ImmOp = nullptr;
-  if (Addr.getOpcode() == ISD::ADD &&
-      (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
+  if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
     if (ImmOp->getSExtValue() < 0)
       return true;
   }



More information about the llvm-commits mailing list