[llvm] 86815a1 - AMDGPU/GlobalISel: Permit mapping G_FRAME_INDEX to sgprs (#101325)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 31 22:42:07 PDT 2024


Author: Matt Arsenault
Date: 2024-08-01T09:42:04+04:00
New Revision: 86815a1842d308521f46048bb9ed08e47c0d8357

URL: https://github.com/llvm/llvm-project/commit/86815a1842d308521f46048bb9ed08e47c0d8357
DIFF: https://github.com/llvm/llvm-project/commit/86815a1842d308521f46048bb9ed08e47c0d8357.diff

LOG: AMDGPU/GlobalISel: Permit mapping G_FRAME_INDEX to sgprs (#101325)

eliminateFrameIndex should now properly handle materializing
frame indices in SGPRs, so treat this like the other constant
operand types.

On average this will produce worse code; we need to detect
VGPR uses, and improve SGPR->VGPR frame index folds.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
    llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 8da8c94b4d665..9a6ba5ac68084 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4060,6 +4060,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_FCONSTANT:
   case AMDGPU::G_CONSTANT:
   case AMDGPU::G_GLOBAL_VALUE:
+  case AMDGPU::G_FRAME_INDEX:
   case AMDGPU::G_BLOCK_ADDR:
   case AMDGPU::G_READSTEADYCOUNTER:
   case AMDGPU::G_READCYCLECOUNTER: {
@@ -4067,13 +4068,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
     break;
   }
-  case AMDGPU::G_FRAME_INDEX: {
-    // TODO: This should be the same as other constants, but eliminateFrameIndex
-    // currently assumes VALU uses.
-    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
-    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
-    break;
-  }
   case AMDGPU::G_DYN_STACKALLOC: {
     // Result is always uniform, and a wave reduction is needed for the source.
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
index 48916d8d9b2c5..84378bcb70684 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/crash-stack-address-O0.ll
@@ -10,9 +10,11 @@ define amdgpu_kernel void @stack_write_fi() {
 ; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b32 s5, 0
+; CHECK-NEXT:    s_mov_b32 s6, 0
 ; CHECK-NEXT:    s_mov_b32 s4, 0
-; CHECK-NEXT:    v_mov_b32_e32 v0, s5
-; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
 ; CHECK-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
index b4b95fdab4ab2..4fdb4082346af 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.gfx.ll
@@ -10,11 +10,13 @@ define amdgpu_ps void @amdgpu_ps() {
 ; MESA-LABEL: amdgpu_ps:
 ; MESA:       ; %bb.0:
 ; MESA-NEXT:    s_add_u32 flat_scratch_lo, s2, s4
-; MESA-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; MESA-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; MESA-NEXT:    v_mov_b32_e32 v0, 0
-; MESA-NEXT:    v_mov_b32_e32 v1, s1
+; MESA-NEXT:    s_mov_b32 s0, 0
+; MESA-NEXT:    s_mov_b64 s[2:3], src_private_base
+; MESA-NEXT:    s_mov_b32 s1, s3
+; MESA-NEXT:    v_mov_b32_e32 v0, s0
 ; MESA-NEXT:    v_mov_b32_e32 v2, 0
+; MESA-NEXT:    v_mov_b32_e32 v1, s1
 ; MESA-NEXT:    flat_store_dword v[0:1], v2
 ; MESA-NEXT:    s_waitcnt vmcnt(0)
 ; MESA-NEXT:    s_endpgm
@@ -24,13 +26,15 @@ define amdgpu_ps void @amdgpu_ps() {
 ; PAL-NEXT:    s_getpc_b64 s[2:3]
 ; PAL-NEXT:    s_mov_b32 s2, s0
 ; PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; PAL-NEXT:    v_mov_b32_e32 v0, 0
 ; PAL-NEXT:    v_mov_b32_e32 v2, 0
 ; PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; PAL-NEXT:    s_and_b32 s3, s3, 0xffff
 ; PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
-; PAL-NEXT:    s_mov_b64 s[0:1], src_private_base
 ; PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; PAL-NEXT:    s_mov_b32 s0, 0
+; PAL-NEXT:    s_mov_b64 s[2:3], src_private_base
+; PAL-NEXT:    s_mov_b32 s1, s3
+; PAL-NEXT:    v_mov_b32_e32 v0, s0
 ; PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; PAL-NEXT:    flat_store_dword v[0:1], v2
 ; PAL-NEXT:    s_waitcnt vmcnt(0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index a5e4151bf3695..f4fd803c8dda8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -55,41 +55,40 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT:    s_add_i32 s0, s0, 0
+; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_add_i32 s0, s0, 0
+; GFX11-NEXT:    s_add_i32 s1, s1, 0
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_load_sindex_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b32 s1, s0, 2
-; GFX12-NEXT:    s_and_b32 s0, s0, 15
-; GFX12-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_add_co_i32 s0, s0, 0
+; GFX12-NEXT:    s_add_co_i32 s1, s1, 0
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v2, off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -378,44 +377,44 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:256 sc0 sc1
+; GFX940-NEXT:    s_addk_i32 s0, 0x100
+; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_small_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX11-NEXT:    scratch_load_b32 v2, off, off glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v0, off, off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_addk_i32 s0, 0x100
+; GFX11-NEXT:    s_addk_i32 s1, 0x100
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:256 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_load_sindex_small_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX12-NEXT:    scratch_load_b32 v3, off, off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b32 s1, s0, 2
-; GFX12-NEXT:    s_and_b32 s0, s0, 15
-; GFX12-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_addk_co_i32 s0, 0x100
+; GFX12-NEXT:    s_addk_co_i32 s1, 0x100
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:256 scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -692,46 +691,44 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, s0
-; GFX940-NEXT:    s_movk_i32 s0, 0x4004
-; GFX940-NEXT:    scratch_load_dword v0, v0, s0 sc0 sc1
+; GFX940-NEXT:    s_addk_i32 s0, 0x4004
+; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX11-NEXT:    scratch_load_b32 v2, off, off offset:4 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    s_addk_i32 s0, 0x4004
+; GFX11-NEXT:    s_addk_i32 s1, 0x4004
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    s_movk_i32 s0, 0x4004
-; GFX11-NEXT:    scratch_load_b32 v0, v1, s0 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX12-LABEL: store_load_sindex_large_offset_kernel:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_load_b32 s0, s[2:3], 0x24
-; GFX12-NEXT:    scratch_load_b32 v3, off, off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_lshl_b32 s1, s0, 2
-; GFX12-NEXT:    s_and_b32 s0, s0, 15
-; GFX12-NEXT:    v_mov_b32_e32 v0, s1
+; GFX12-NEXT:    s_and_b32 s1, s0, 15
 ; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_mov_b32_e32 v2, s0
-; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
+; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX12-NEXT:    s_addk_co_i32 s0, 0x4000
+; GFX12-NEXT:    s_addk_co_i32 s1, 0x4000
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:16384 scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_endpgm
 bb:
@@ -995,25 +992,28 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX940-LABEL: store_load_large_imm_offset_kernel:
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 13
+; GFX940-NEXT:    s_movk_i32 s0, 0x3e80
 ; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3e80
-; GFX940-NEXT:    v_mov_b32_e32 v1, 15
-; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 15
+; GFX940-NEXT:    s_add_i32 s0, s0, 4
+; GFX940-NEXT:    scratch_store_dword off, v0, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:4 sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_large_imm_offset_kernel:
 ; GFX11:       ; %bb.0: ; %bb
-; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
-; GFX11-NEXT:    v_mov_b32_e32 v2, 15
+; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX11-NEXT:    s_movk_i32 s0, 0x3e80
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s0, s0, 4
 ; GFX11-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v1, v2, off offset:4 dlc
+; GFX11-NEXT:    scratch_store_b32 off, v1, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:4 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, off, s0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -1075,26 +1075,31 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX940:       ; %bb.0: ; %bb
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    v_mov_b32_e32 v0, 13
+; GFX940-NEXT:    s_movk_i32 s0, 0x3e80
+; GFX940-NEXT:    s_add_i32 s1, s32, 4
 ; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3e80
-; GFX940-NEXT:    v_mov_b32_e32 v1, 15
-; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:4 sc0 sc1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 15
+; GFX940-NEXT:    s_add_i32 s0, s0, s1
+; GFX940-NEXT:    scratch_store_dword off, v0, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:4 sc0 sc1
+; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_large_imm_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
-; GFX11-NEXT:    v_mov_b32_e32 v2, 15
+; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX11-NEXT:    s_movk_i32 s0, 0x3e80
+; GFX11-NEXT:    s_add_i32 s1, s32, 4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s0, s0, s1
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:4 dlc
+; GFX11-NEXT:    scratch_store_b32 off, v1, s0 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:4 glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, off, s0 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index db944b98a3013..4fcde0f2fc7cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -11,12 +11,11 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
 ; GCN-NEXT:    s_load_dwordx2 s[24:25], s[6:7], 0x10
 ; GCN-NEXT:    s_add_u32 s0, s0, s13
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    v_mov_b32_e32 v16, 0
+; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_load_dwordx16 s[36:51], s[22:23], 0x0
 ; GCN-NEXT:    s_load_dwordx16 s[52:67], s[22:23], 0x40
 ; GCN-NEXT:    s_load_dwordx16 s[4:19], s[22:23], 0x80
-; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s36
 ; GCN-NEXT:    v_mov_b32_e32 v1, s37
@@ -143,16 +142,17 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
 ; GCN-NEXT:    v_mov_b32_e32 v0, s48
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:240
 ; GCN-NEXT:    v_mov_b32_e32 v0, s49
+; GCN-NEXT:    s_and_b32 s4, s25, 63
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:244
 ; GCN-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-NEXT:    s_and_b32 s4, s25, 63
+; GCN-NEXT:    s_lshl_b32 s4, s4, 2
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:248
 ; GCN-NEXT:    v_mov_b32_e32 v0, s51
-; GCN-NEXT:    s_lshl_b32 s4, s4, 2
+; GCN-NEXT:    s_add_u32 s4, 0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:252
-; GCN-NEXT:    v_add_u32_e32 v0, s4, v16
-; GCN-NEXT:    v_mov_b32_e32 v1, s24
-; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT:    v_mov_b32_e32 v0, s24
+; GCN-NEXT:    v_mov_b32_e32 v1, s4
+; GCN-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
index 14f69c301ec38..76994c5cccf5f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-frame-index.mir
@@ -2,22 +2,45 @@
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
 
---- |
-  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-  define void @test_frame_index_p5() {
-      %ptr0 = alloca i32, addrspace(5)
-     ret void
-    }
-...
 ---
 name: test_frame_index_p5
 legalized:       true
 stack:
-  - { id: 0, name: ptr0, offset: 0, size: 4, alignment: 4 }
+  - { id: 0, offset: 0, size: 4, alignment: 4 }
 body: |
   bb.0:
     ; CHECK-LABEL: name: test_frame_index_p5
-    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:vgpr(p5) = G_FRAME_INDEX %stack.0.ptr0
-    %0:_(p5) = G_FRAME_INDEX %stack.0.ptr0
+    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:sgpr(p5) = G_FRAME_INDEX %stack.0
+    %0:_(p5) = G_FRAME_INDEX %stack.0
+
+...
+
+---
+name: test_frame_index_p5_sgpr_use
+legalized:       true
+stack:
+  - { id: 0, offset: 0, size: 4, alignment: 4 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: test_frame_index_p5_sgpr_use
+    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:sgpr(p5) = G_FRAME_INDEX %stack.0
+    ; CHECK-NEXT: $sgpr0 = COPY [[FRAME_INDEX]](p5)
+    %0:_(p5) = G_FRAME_INDEX %stack.0
+    $sgpr0 = COPY %0
+
+...
+
+---
+name: test_frame_index_p5_vgpr_use
+legalized:       true
+stack:
+  - { id: 0, offset: 0, size: 4, alignment: 4 }
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: test_frame_index_p5_vgpr_use
+    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:sgpr(p5) = G_FRAME_INDEX %stack.0
+    ; CHECK-NEXT: $vgpr0 = COPY [[FRAME_INDEX]](p5)
+    %0:_(p5) = G_FRAME_INDEX %stack.0
+    $vgpr0 = COPY %0
 
 ...

diff  --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
index e2b4865410db8..3216e71e6221a 100644
--- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
+++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll
@@ -96,16 +96,29 @@ define void @private_alloca_to_flat(ptr %ptr) {
 ; OPT-NEXT:    store volatile i32 7, ptr [[TMP1]], align 4
 ; OPT-NEXT:    ret void
 ;
-; ASM-LABEL: private_alloca_to_flat:
-; ASM:       ; %bb.0:
-; ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; ASM-NEXT:    s_mov_b64 s[4:5], src_private_base
-; ASM-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
-; ASM-NEXT:    v_mov_b32_e32 v1, s5
-; ASM-NEXT:    v_mov_b32_e32 v2, 7
-; ASM-NEXT:    flat_store_dword v[0:1], v2
-; ASM-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; ASM-NEXT:    s_setpc_b64 s[30:31]
+; DAGISEL-ASM-LABEL: private_alloca_to_flat:
+; DAGISEL-ASM:       ; %bb.0:
+; DAGISEL-ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT:    s_mov_b64 s[4:5], src_private_base
+; DAGISEL-ASM-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; DAGISEL-ASM-NEXT:    v_mov_b32_e32 v1, s5
+; DAGISEL-ASM-NEXT:    v_mov_b32_e32 v2, 7
+; DAGISEL-ASM-NEXT:    flat_store_dword v[0:1], v2
+; DAGISEL-ASM-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; DAGISEL-ASM-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-ASM-LABEL: private_alloca_to_flat:
+; GISEL-ASM:       ; %bb.0:
+; GISEL-ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT:    s_lshr_b32 s4, s32, 6
+; GISEL-ASM-NEXT:    s_mov_b64 s[6:7], src_private_base
+; GISEL-ASM-NEXT:    s_mov_b32 s5, s7
+; GISEL-ASM-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-ASM-NEXT:    v_mov_b32_e32 v2, 7
+; GISEL-ASM-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-ASM-NEXT:    flat_store_dword v[0:1], v2
+; GISEL-ASM-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-ASM-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca i8, addrspace(5)
   %x = addrspacecast ptr addrspace(5) %alloca to ptr
   store volatile i32 7, ptr %x
@@ -224,8 +237,9 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) {
 ; GISEL-ASM-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-ASM-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GISEL-ASM-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-ASM-NEXT:    s_lshr_b32 s6, s32, 6
 ; GISEL-ASM-NEXT:    s_xor_b64 s[4:5], vcc, -1
-; GISEL-ASM-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
+; GISEL-ASM-NEXT:    v_mov_b32_e32 v0, s6
 ; GISEL-ASM-NEXT:    s_and_saveexec_b64 s[6:7], vcc
 ; GISEL-ASM-NEXT:  ; %bb.1: ; %then
 ; GISEL-ASM-NEXT:    v_and_b32_e32 v0, 0xffff, v1

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 087d38ce7b004..89da9b8e75bc9 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -37,17 +37,17 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX940-GISEL-LABEL: soff1_voff1:
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v3, 2, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 2
+; GFX940-GISEL-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 4
@@ -78,12 +78,14 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v4, v1, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -114,8 +116,9 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -166,13 +169,13 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -208,10 +211,12 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -246,10 +251,12 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -300,13 +307,13 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -342,10 +349,12 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -380,10 +389,12 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -434,18 +445,18 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX940-GISEL-LABEL: soff2_voff1:
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v3, 2, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 2
+; GFX940-GISEL-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 4
@@ -478,14 +489,15 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v4, v1, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -519,8 +531,9 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -571,14 +584,14 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -615,12 +628,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -656,12 +670,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -712,14 +727,14 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -756,12 +771,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -797,12 +813,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -853,18 +870,18 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX940-GISEL-LABEL: soff4_voff1:
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v3, 2, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 2
+; GFX940-GISEL-NEXT:    scratch_store_byte v3, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, 4, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 4
@@ -897,14 +914,15 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v4, v1, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -938,8 +956,9 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -990,14 +1009,14 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -1034,12 +1053,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -1075,12 +1095,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
@@ -1130,14 +1151,14 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX940-GISEL:       ; %bb.0: ; %bb
 ; GFX940-GISEL-NEXT:    s_load_dword s0, s[2:3], 0x24
 ; GFX940-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX940-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 1
+; GFX940-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX940-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX940-GISEL-NEXT:    v_add3_u32 v0, v1, s0, v0
-; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 1, v0
-; GFX940-GISEL-NEXT:    scratch_store_byte v1, v2, off sc0 sc1
+; GFX940-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
+; GFX940-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX940-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
 ; GFX940-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-GISEL-NEXT:    v_add_u32_e32 v1, 2, v0
 ; GFX940-GISEL-NEXT:    v_mov_b32_e32 v2, 2
@@ -1174,12 +1195,13 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
@@ -1215,12 +1237,13 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[2:3], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_add3_u32 v0, 0, s0, v0
+; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS


        


More information about the llvm-commits mailing list