[llvm] [AMDGPU] Allow potentially negative flat scratch offsets on GFX12 (PR #78193)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 15 09:03:43 PST 2024


https://github.com/jayfoad created https://github.com/llvm/llvm-project/pull/78193

https://github.com/llvm/llvm-project/pull/70634 has disabled the use
of potentially negative scratch offsets, but we can use them on GFX12.


>From aefd5304bec43adc8d30db610aff3c91d01cf4d6 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 6 Dec 2023 14:27:59 -0800
Subject: [PATCH] [AMDGPU] Allow potentially negative flat scratch offsets on
 GFX12

https://github.com/llvm/llvm-project/pull/70634 has disabled the use
of potentially negative scratch offsets, but we can use them on GFX12.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp |  10 ++
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  10 ++
 .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 105 +++++++++---------
 .../CodeGen/AMDGPU/flat-scratch-i8-i16.ll     |  54 ++++-----
 llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll  |  98 ++++++++--------
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      |  46 ++++----
 .../memory-legalizer-private-nontemporal.ll   |  22 ++--
 .../memory-legalizer-private-volatile.ll      |  22 ++--
 8 files changed, 179 insertions(+), 188 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e753b75dbbf492..54b9180d887761 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1184,6 +1184,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
   if (isNoUnsignedWrap(Addr))
     return true;
 
+  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+  // values.
+  if (AMDGPU::isGFX12Plus(*Subtarget))
+    return true;
+
   auto LHS = Addr.getOperand(0);
   auto RHS = Addr.getOperand(1);
   return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
@@ -1192,6 +1197,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
 // Check address value in SGPR/VGPR are legal for flat scratch in the form
 // of: SGPR + VGPR + Imm.
 bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
+  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+  // values.
+  if (AMDGPU::isGFX12Plus(*Subtarget))
+    return true;
+
   auto Base = Addr.getOperand(0);
   auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
   // If the immediate offset is negative and within certain range, the base
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 1d31c6b8fde93a..7f319d9bf678c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4586,6 +4586,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
   if (isNoUnsignedWrap(AddrMI))
     return true;
 
+  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+  // values.
+  if (AMDGPU::isGFX12Plus(STI))
+    return true;
+
   Register LHS = AddrMI->getOperand(1).getReg();
   Register RHS = AddrMI->getOperand(2).getReg();
   return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
@@ -4595,6 +4600,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
 // of: SGPR + VGPR + Imm.
 bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
     Register Addr) const {
+  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+  // values.
+  if (AMDGPU::isGFX12Plus(STI))
+    return true;
+
   MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
   Register Base = AddrMI->getOperand(1).getReg();
   std::optional<DefinitionAndSourceRegister> BaseDef =
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 4603fbcd525c78..0bc836fd1e830c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -79,16 +79,17 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX12-LABEL: store_load_sindex_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    s_lshl_b32 s1, s0, 2
+; GFX12-NEXT:    s_and_b32 s0, s0, 15
+; GFX12-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    s_add_co_i32 s0, s0, 4
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:4 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:4 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -170,8 +171,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
 ; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:4 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    v_add_nc_u32_e32 v1, 4, v1
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:128 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -248,14 +248,13 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX12-LABEL: store_load_vindex_foo:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_add_nc_u32_e32 v1, s32, v1
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -391,17 +390,19 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX12-LABEL: store_load_sindex_small_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT:    scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b32 s1, s0, 2
+; GFX12-NEXT:    s_and_b32 s0, s0, 15
+; GFX12-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    s_addk_co_i32 s0, 0x104
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:260 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:260 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:260 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -490,13 +491,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:260 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    v_add_nc_u32_e32 v1, 0x104, v1
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:384 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -589,16 +590,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX12-LABEL: store_load_vindex_small_offset_foo:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x100
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    v_add_nc_u32_e32 v1, s0, v1
-; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 offset:256 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -697,17 +696,19 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX12-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-NEXT:    scratch_load_b32 v2, off, off offset:4 th:TH_LOAD_RT_NT
-; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    s_and_b32 s1, s0, 15
+; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_lshl_b32 s1, s0, 2
+; GFX12-NEXT:    s_and_b32 s0, s0, 15
+; GFX12-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT:    s_addk_co_i32 s0, 0x4004
-; GFX12-NEXT:    scratch_store_b32 off, v0, s0 th:TH_STORE_NT_RT
+; GFX12-NEXT:    v_mov_b32_e32 v2, s0
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:16388 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:16388 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:16388 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -798,13 +799,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX12-NEXT:    scratch_load_b32 v3, off, off offset:4 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX12-NEXT:    scratch_store_b32 v0, v2, off offset:16388 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v1
-; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:124 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    scratch_load_b32 v0, v1, off offset:16512 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -899,16 +900,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX12-LABEL: store_load_vindex_large_offset_foo:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    v_add_nc_u32_e32 v1, s0, v1
-; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT:    scratch_store_b32 v1, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 offset:16384 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v0, s32 offset:16384 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -1154,11 +1153,9 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_add_nc_u32_e32 v0, 4, v0
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:1028 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:1028 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
index 93e8630dc7f560..e9d3b9aae653ae 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -894,8 +894,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in,
 ;
 ; GFX12-LABEL: test_scratch_load_i8_zext_svs:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
-; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:1
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_u8 v0, v0, s0 offset:1
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
@@ -931,8 +931,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in,
 ;
 ; GFX12-LABEL: test_scratch_load_i8_sext_svs:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
-; GFX12-NEXT:    scratch_load_i8 v0, v0, off offset:1
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_i8 v0, v0, s0 offset:1
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
@@ -968,8 +968,8 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in
 ;
 ; GFX12-LABEL: test_scratch_load_i16_zext_svs:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
-; GFX12-NEXT:    scratch_load_u16 v0, v0, off offset:2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_u16 v0, v0, s0 offset:2
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
@@ -1005,8 +1005,8 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in
 ;
 ; GFX12-LABEL: test_scratch_load_i16_sext_svs:
 ; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
-; GFX12-NEXT:    scratch_load_i16 v0, v0, off offset:2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_i16 v0, v0, s0 offset:2
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v0
 ; GFX12-NEXT:    s_endpgm
@@ -1046,9 +1046,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5)
 ;
 ; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX12-NEXT:    scratch_load_d16_u8 v3, v0, off offset:1
+; GFX12-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_d16_u8 v3, v0, s0 offset:1
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
@@ -1090,9 +1089,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5)
 ;
 ; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX12-NEXT:    scratch_load_d16_i8 v3, v0, off offset:1
+; GFX12-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_d16_i8 v3, v0, s0 offset:1
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
@@ -1134,9 +1132,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inre
 ;
 ; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_svs:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0xffff0000
-; GFX12-NEXT:    scratch_load_d16_b16 v3, v0, off offset:2
+; GFX12-NEXT:    v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_d16_b16 v3, v0, s0 offset:2
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
@@ -1178,9 +1175,8 @@ define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5)
 ;
 ; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v3, -1
-; GFX12-NEXT:    scratch_load_d16_hi_u8 v3, v0, off offset:1
+; GFX12-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_d16_hi_u8 v3, v0, s0 offset:1
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
@@ -1222,9 +1218,8 @@ define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5)
 ;
 ; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v3, -1
-; GFX12-NEXT:    scratch_load_d16_hi_i8 v3, v0, off offset:1
+; GFX12-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_d16_hi_i8 v3, v0, s0 offset:1
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
@@ -1266,9 +1261,8 @@ define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inre
 ;
 ; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_svs:
 ; GFX12:       ; %bb.0: ; %bb
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
-; GFX12-NEXT:    v_mov_b32_e32 v3, -1
-; GFX12-NEXT:    scratch_load_d16_hi_b16 v3, v0, off offset:2
+; GFX12-NEXT:    v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-NEXT:    scratch_load_d16_hi_b16 v3, v0, s0 offset:2
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    flat_store_b32 v[1:2], v3
 ; GFX12-NEXT:    s_endpgm
@@ -1309,9 +1303,9 @@ define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrsp
 ; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_svs:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX12-NEXT:    v_lshl_add_u32 v1, v2, 2, s0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    scratch_store_d16_hi_b8 v1, v0, off offset:4
+; GFX12-NEXT:    scratch_store_d16_hi_b8 v1, v0, s0 offset:4
 ; GFX12-NEXT:    s_endpgm
 bb:
   %load = load <4 x i8>, ptr %in
@@ -1350,9 +1344,9 @@ define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrs
 ; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_svs:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    flat_load_b32 v0, v[0:1]
-; GFX12-NEXT:    v_lshl_add_u32 v1, v2, 2, s0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX12-NEXT:    scratch_store_d16_hi_b16 v1, v0, off offset:2
+; GFX12-NEXT:    scratch_store_d16_hi_b16 v1, v0, s0 offset:2
 ; GFX12-NEXT:    s_endpgm
 bb:
   %load = load <2 x i16>, ptr %in
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 04eb6dcff4632b..bff88a77009c91 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -95,12 +95,12 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -221,13 +221,12 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -349,13 +348,12 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -480,12 +478,12 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -609,17 +607,17 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX12-SDAG-LABEL: soff2_voff2:
 ; GFX12-SDAG:       ; %bb.0: ; %bb
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -743,17 +741,17 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX12-SDAG-LABEL: soff2_voff4:
 ; GFX12-SDAG:       ; %bb.0: ; %bb
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -879,12 +877,12 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -1008,17 +1006,17 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX12-SDAG-LABEL: soff4_voff2:
 ; GFX12-SDAG:       ; %bb.0: ; %bb
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
@@ -1140,17 +1138,17 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX12-SDAG-LABEL: soff4_voff4:
 ; GFX12-SDAG:       ; %bb.0: ; %bb
 ; GFX12-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT:    s_add_co_i32 s0, s0, 4
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 th:TH_STORE_NT_RT
+; GFX12-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 th:TH_STORE_NT_RT
 ; GFX12-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 98379f5e3c68b4..f92a2d138e12ee 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -878,10 +878,10 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
@@ -945,10 +945,10 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
+; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, s32 th:TH_STORE_NT_RT
 ; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 th:TH_LOAD_RT_NT
 ; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -2097,12 +2097,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x100
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
@@ -2178,12 +2177,11 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX12-PAL-NEXT:    s_add_co_i32 s0, s32, 0x100
+; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
 ; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 th:TH_STORE_NT_RT
 ; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 th:TH_LOAD_RT_NT
 ; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -3328,12 +3326,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX12-NEXT:    s_add_co_i32 s0, s32, 0x4000
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 offset:16384 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
@@ -3411,12 +3408,11 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX12-PAL-NEXT:    s_add_co_i32 s0, s32, 0x4000
+; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 th:TH_LOAD_RT_NT
 ; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
 ; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
-; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, off th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:16384 th:TH_STORE_NT_RT
 ; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 th:TH_LOAD_RT_NT
 ; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -3797,13 +3793,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX12-LABEL: store_load_vidx_sidx_offset:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
+; GFX12-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
+; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:1028 th:TH_STORE_NT_RT
 ; GFX12-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
+; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:1028 th:TH_LOAD_RT_NT
 ; GFX12-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-NEXT:    s_endpgm
 ;
@@ -3879,13 +3874,12 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX12-PAL-LABEL: store_load_vidx_sidx_offset:
 ; GFX12-PAL:       ; %bb.0: ; %bb
 ; GFX12-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX12-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s0, v0
-; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
-; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1024 th:TH_STORE_NT_RT
+; GFX12-PAL-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
+; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1028 th:TH_STORE_NT_RT
 ; GFX12-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX12-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1024 th:TH_LOAD_RT_NT
+; GFX12-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1028 th:TH_LOAD_RT_NT
 ; GFX12-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-PAL-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 047cb3ab400084..ca029639923bd2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -394,13 +394,11 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX12-WGP-LABEL: private_nontemporal_load_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, s2 th:TH_LOAD_NT_HT
 ; GFX12-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-WGP-NEXT:    s_nop 0
@@ -409,13 +407,11 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX12-CU-LABEL: private_nontemporal_load_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_NT_HT
+; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, s2 th:TH_LOAD_NT_HT
 ; GFX12-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-CU-NEXT:    s_nop 0
@@ -794,10 +790,9 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB
+; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, s2 th:TH_STORE_NT_WB
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
 ; GFX12-CU-LABEL: private_nontemporal_store_1:
@@ -805,10 +800,9 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_WB
+; GFX12-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, s2 th:TH_STORE_NT_WB
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index 4b1fb295adec2a..feeff499458ead 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -284,13 +284,11 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX12-WGP-LABEL: private_volatile_load_1:
 ; GFX12-WGP:       ; %bb.0: ; %entry
-; GFX12-WGP-NEXT:    s_clause 0x1
 ; GFX12-WGP-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-WGP-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT
+; GFX12-WGP-NEXT:    scratch_load_b32 v0, v0, s2 th:TH_LOAD_RT_NT
 ; GFX12-WGP-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-WGP-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-WGP-NEXT:    s_nop 0
@@ -299,13 +297,11 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX12-CU-LABEL: private_volatile_load_1:
 ; GFX12-CU:       ; %bb.0: ; %entry
-; GFX12-CU-NEXT:    s_clause 0x1
 ; GFX12-CU-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GFX12-CU-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX12-CU-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
-; GFX12-CU-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
-; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, off th:TH_LOAD_RT_NT
+; GFX12-CU-NEXT:    scratch_load_b32 v0, v0, s2 th:TH_LOAD_RT_NT
 ; GFX12-CU-NEXT:    s_waitcnt vmcnt(0)
 ; GFX12-CU-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX12-CU-NEXT:    s_nop 0
@@ -594,10 +590,9 @@ define amdgpu_kernel void @private_volatile_store_1(
 ; GFX12-WGP-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
 ; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-WGP-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-WGP-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX12-WGP-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-WGP-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT
+; GFX12-WGP-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-WGP-NEXT:    scratch_store_b32 v0, v1, s2 th:TH_STORE_NT_RT
 ; GFX12-WGP-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-WGP-NEXT:    s_endpgm
 ;
@@ -606,10 +601,9 @@ define amdgpu_kernel void @private_volatile_store_1(
 ; GFX12-CU-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
 ; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX12-CU-NEXT:    s_load_b32 s0, s[0:1], 0x0
-; GFX12-CU-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX12-CU-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX12-CU-NEXT:    v_mov_b32_e32 v1, s0
-; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, off th:TH_STORE_NT_RT
+; GFX12-CU-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX12-CU-NEXT:    scratch_store_b32 v0, v1, s2 th:TH_STORE_NT_RT
 ; GFX12-CU-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX12-CU-NEXT:    s_endpgm
     ptr addrspace(1) %in, ptr addrspace(5) %out) {



More information about the llvm-commits mailing list