[llvm] 91aac7c - AMDGPU: Handle s_add_u32 in eliminateFrameIndex (#129628)

via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 4 17:09:49 PST 2025


Author: Matt Arsenault
Date: 2025-03-05T08:09:46+07:00
New Revision: 91aac7c379fb92348593d51e3f2d9e490ff67526

URL: https://github.com/llvm/llvm-project/commit/91aac7c379fb92348593d51e3f2d9e490ff67526
DIFF: https://github.com/llvm/llvm-project/commit/91aac7c379fb92348593d51e3f2d9e490ff67526.diff

LOG: AMDGPU: Handle s_add_u32 in eliminateFrameIndex (#129628)

We can fold frame indexes directly into existing immediate operands,
just like is already done for s_add_i32. We happen to use s_add_i32 in
the 32-bit add case, but s_add_u32 appears in the a 64-bit add sequence
of a flat pointer if an addrpacecast source is a frame index.

This avoids, but does not address a failure exposed after
a3165398db0736588daedb07650195502592e567 where two literal operands
end up in the final instruction. The underlying issue still exists for
some instructions without special handling in eliminateFrameIndex.

Added: 
    llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-u32.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
    llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
    llvm/test/CodeGen/AMDGPU/frame-index.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 7d6990c097774..128cd8244a477 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2713,7 +2713,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
       return true;
     }
-    case AMDGPU::S_ADD_I32: {
+    case AMDGPU::S_ADD_I32:
+    case AMDGPU::S_ADD_U32: {
       // TODO: Handle s_or_b32, s_and_b32.
       unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
       MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
@@ -2773,7 +2774,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
           DstReg = TmpReg;
         }
 
-        auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
+        auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
                           .addDef(DstReg, RegState::Renamable)
                           .addReg(MaterializedReg, RegState::Kill)
                           .add(OtherOp);

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index c3b48b5d2ddff..378c6312c52be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -142,13 +142,12 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
 ; GCN-NEXT:    v_mov_b32_e32 v0, s48
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:240
 ; GCN-NEXT:    v_mov_b32_e32 v0, s49
-; GCN-NEXT:    s_and_b32 s4, s25, 63
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:244
 ; GCN-NEXT:    v_mov_b32_e32 v0, s50
-; GCN-NEXT:    s_lshl_b32 s4, s4, 2
+; GCN-NEXT:    s_and_b32 s4, s25, 63
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:248
 ; GCN-NEXT:    v_mov_b32_e32 v0, s51
-; GCN-NEXT:    s_add_u32 s4, 0, s4
+; GCN-NEXT:    s_lshl_b32 s4, s4, 2
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:252
 ; GCN-NEXT:    v_mov_b32_e32 v0, s24
 ; GCN-NEXT:    v_mov_b32_e32 v1, s4

diff  --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-u32.mir
new file mode 100644
index 0000000000000..af61bd70f16b6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-s-add-u32.mir
@@ -0,0 +1,123 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=MUBUFW32 %s
+
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW64 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW32 %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=FLATSCRW32 %s
+
+---
+name: s_add_u32__inline_imm__fi_offset0
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 32, alignment: 16 }
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+body:             |
+  bb.0:
+    ; MUBUFW64-LABEL: name: s_add_u32__inline_imm__fi_offset0
+    ; MUBUFW64: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+    ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_U32 12, $sgpr4, implicit-def dead $scc
+    ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7
+    ;
+    ; MUBUFW32-LABEL: name: s_add_u32__inline_imm__fi_offset0
+    ; MUBUFW32: renamable $sgpr4 = S_LSHR_B32 $sgpr32, 5, implicit-def dead $scc
+    ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_U32 12, $sgpr4, implicit-def dead $scc
+    ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7
+    ;
+    ; FLATSCRW64-LABEL: name: s_add_u32__inline_imm__fi_offset0
+    ; FLATSCRW64: renamable $sgpr7 = S_ADD_U32 12, $sgpr32, implicit-def dead $scc
+    ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7
+    ;
+    ; FLATSCRW32-LABEL: name: s_add_u32__inline_imm__fi_offset0
+    ; FLATSCRW32: renamable $sgpr7 = S_ADD_U32 12, $sgpr32, implicit-def dead $scc
+    ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7
+    renamable $sgpr7 = S_ADD_U32 12, %stack.0, implicit-def dead $scc
+    SI_RETURN implicit $sgpr7
+
+...
+
+---
+name: s_add_u32__kernel__literal__fi_offset96__offset_literal
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 96, alignment: 16 }
+  - { id: 1, size: 128, alignment: 4 }
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; MUBUFW64-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal
+    ; MUBUFW64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW64-NEXT: {{  $}}
+    ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW64-NEXT: renamable $sgpr7 = S_MOV_B32 164
+    ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7
+    ;
+    ; MUBUFW32-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal
+    ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW32-NEXT: {{  $}}
+    ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW32-NEXT: renamable $sgpr7 = S_MOV_B32 164
+    ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7
+    ;
+    ; FLATSCRW64-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal
+    ; FLATSCRW64: renamable $sgpr7 = S_MOV_B32 164
+    ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7
+    ;
+    ; FLATSCRW32-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal
+    ; FLATSCRW32: renamable $sgpr7 = S_MOV_B32 164
+    ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7
+    renamable $sgpr7 = S_ADD_U32 68, %stack.1, implicit-def dead $scc
+    SI_RETURN implicit $sgpr7
+...
+
+---
+name: s_add_u32__kernel__literal__fi_offset96__offset_literal_live_scc
+tracksRegLiveness: true
+stack:
+  - { id: 0, size: 96, alignment: 16 }
+  - { id: 1, size: 128, alignment: 4 }
+machineFunctionInfo:
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  isEntryFunction: true
+body:             |
+  bb.0:
+    ; MUBUFW64-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal_live_scc
+    ; MUBUFW64: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW64-NEXT: {{  $}}
+    ; MUBUFW64-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW64-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW64-NEXT: renamable $sgpr7 = S_ADD_U32 164, 0, implicit-def $scc
+    ; MUBUFW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc
+    ;
+    ; MUBUFW32-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal_live_scc
+    ; MUBUFW32: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW32-NEXT: {{  $}}
+    ; MUBUFW32-NEXT: $sgpr0 = S_ADD_U32 $sgpr0, $noreg, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW32-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
+    ; MUBUFW32-NEXT: renamable $sgpr7 = S_ADD_U32 164, 0, implicit-def $scc
+    ; MUBUFW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc
+    ;
+    ; FLATSCRW64-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal_live_scc
+    ; FLATSCRW64: renamable $sgpr7 = S_ADD_U32 164, 0, implicit-def $scc
+    ; FLATSCRW64-NEXT: SI_RETURN implicit $sgpr7, implicit $scc
+    ;
+    ; FLATSCRW32-LABEL: name: s_add_u32__kernel__literal__fi_offset96__offset_literal_live_scc
+    ; FLATSCRW32: renamable $sgpr7 = S_ADD_U32 164, 0, implicit-def $scc
+    ; FLATSCRW32-NEXT: SI_RETURN implicit $sgpr7, implicit $scc
+    renamable $sgpr7 = S_ADD_U32 68, %stack.1, implicit-def $scc
+    SI_RETURN implicit $sgpr7, implicit $scc
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 346b69c362c04..96d0e383761d1 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -38,7 +38,6 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 2, v0
@@ -76,12 +75,9 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_add_nc_u32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, 4, v0
@@ -113,8 +109,7 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
@@ -168,7 +163,6 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX942-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
@@ -207,11 +201,9 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
@@ -246,11 +238,9 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
@@ -304,7 +294,6 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX942-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
@@ -343,11 +332,9 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
@@ -382,11 +369,9 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
@@ -440,7 +425,6 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX942-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 2, v0
@@ -483,8 +467,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
@@ -520,8 +503,7 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
@@ -576,7 +558,6 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX942-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX942-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
@@ -616,11 +597,10 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -657,11 +637,10 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
@@ -717,7 +696,6 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX942-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX942-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
@@ -757,11 +735,10 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -798,11 +775,10 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
@@ -857,7 +833,6 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX942-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 2, v0
@@ -900,8 +875,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_add_nc_u32 v5, 2, v0
@@ -937,8 +911,7 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
@@ -993,7 +966,6 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX942-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX942-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
@@ -1033,11 +1005,10 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 1, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1074,11 +1045,10 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
@@ -1133,7 +1103,6 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX942-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX942-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 1, v0
 ; GFX942-GISEL-NEXT:    scratch_store_byte v2, v1, off sc0 sc1
@@ -1173,11 +1142,10 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1214,11 +1182,10 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS
@@ -1263,7 +1230,6 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) {
 ; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT:    s_add_u32 s0, 0, s0
 ; GFX942-GISEL-NEXT:    v_add3_u32 v0, s0, v0, -1
 ; GFX942-GISEL-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -1285,8 +1251,7 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) {
 ; GFX11-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_add_u32 s0, 0, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX11-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
@@ -1306,8 +1271,7 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) {
 ; GFX12-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, 0, s0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
 ; GFX12-GISEL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
index b1ea275a97a39..004403f46a4d4 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -346,4 +346,32 @@ entry:
   ret void
 }
 
+; Check for "SOP2/SOPC instruction requires too many immediate
+; constants" verifier error.  Frame index would fold into low half of
+; the lowered flat pointer add, and use s_add_u32 instead of
+; s_add_i32.
+
+; GCN-LABEL: {{^}}fi_sop2_s_add_u32_literal_error:
+; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0, 0x2010
+; GCN: s_addc_u32 [[ADD_HI:s[0-9]+]], s{{[0-9]+}}, 0
+define amdgpu_kernel void @fi_sop2_s_add_u32_literal_error() #0 {
+entry:
+  %.omp.reduction.element.i.i.i.i = alloca [1024 x i32], align 4, addrspace(5)
+  %Total3.i.i = alloca [1024 x i32], align 16, addrspace(5)
+  %Total3.ascast.i.i = addrspacecast ptr addrspace(5) %Total3.i.i to ptr
+  %gep = getelementptr i8, ptr %Total3.ascast.i.i, i64 4096
+  %p2i = ptrtoint ptr %gep to i64
+  br label %.shuffle.then.i.i.i.i
+
+.shuffle.then.i.i.i.i:                            ; preds = %.shuffle.then.i.i.i.i, %entry
+  store i64 0, ptr addrspace(5) null, align 4
+  %icmp = icmp ugt i64 %p2i, 1
+  br i1 %icmp, label %.shuffle.then.i.i.i.i, label %vector.body.i.i.i.i
+
+vector.body.i.i.i.i:                              ; preds = %.shuffle.then.i.i.i.i
+  %wide.load9.i.i.i.i = load <2 x i32>, ptr addrspace(5) %.omp.reduction.element.i.i.i.i, align 4
+  store <2 x i32> %wide.load9.i.i.i.i, ptr addrspace(5) null, align 4
+  ret void
+}
+
 attributes #0 = { nounwind }

diff  --git a/llvm/test/CodeGen/AMDGPU/frame-index.mir b/llvm/test/CodeGen/AMDGPU/frame-index.mir
index 5815d2bfe7bf8..81bd8baaa0e5d 100644
--- a/llvm/test/CodeGen/AMDGPU/frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/frame-index.mir
@@ -91,8 +91,8 @@ body:             |
     ; GCN-LABEL: name: func_add_constant_to_fi_uniform_SCC_clobber_i32
     ; GCN: liveins: $sgpr30_sgpr31
     ; GCN-NEXT: {{  $}}
-    ; GCN-NEXT: $sgpr0 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
-    ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_U32 killed $sgpr0, 4, implicit-def $scc
+    ; GCN-NEXT: renamable $sgpr0 = S_LSHR_B32 $sgpr32, 6, implicit-def dead $scc
+    ; GCN-NEXT: renamable $sgpr4 = nuw S_ADD_U32 $sgpr0, 4, implicit-def $scc
     ; GCN-NEXT: renamable $sgpr5 = S_ADDC_U32 $sgpr4, 1234567, implicit-def $scc, implicit $scc
     ; GCN-NEXT: $sgpr0 = S_LSHR_B32 $sgpr32, 6, implicit-def $scc
     ; GCN-NEXT: $sgpr0 = S_ADD_I32 killed $sgpr0, 8, implicit-def $scc


        


More information about the llvm-commits mailing list