[llvm] [AMDGPU] Fix bad removal of s_delay_alu (PR #145728)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 25 08:52:43 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: Ana Mihajlovic (mihajlovicana)
Changes:
The instructionWaitsForSGPRWrites function currently covers all SALU instructions, including ones like s_waitcnt that do not read any SGPR. As a result, s_delay_alu instructions are removed in cases like a VALU->SGPR->VALU dependency chain, which causes a performance regression. This change modifies the function so that it also checks whether the instruction actually reads a register.
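For illustration, here is one of the GFX12 sequences from the updated tests below (the GFX12W64 check lines in atomic_optimizations_buffer.ll), with comments added here: v_readfirstlane_b32 writes s2 and the later v_add_nc_u32 reads it, while the intervening s_wait_alu is SALU-flagged but has no register uses, so it must not be treated as already waiting for the SGPR write.

```
v_readfirstlane_b32 s2, v1        ; VALU writing SGPR s2
v_mov_b32_e32 v1, 0               ; unrelated VALU in between
s_wait_alu 0xf1ff                 ; SALU-flagged, but reads no SGPR
s_delay_alu instid0(VALU_DEP_2)   ; restored by this change: the next VALU still depends on the SGPR write two VALUs back
v_add_nc_u32_e32 v0, s2, v0       ; VALU reading s2
```

Before this change, the s_wait_alu satisfied instructionWaitsForSGPRWrites, so the s_delay_alu hint was dropped even though the dependency on s2 was still outstanding.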
---
Patch is 522.78 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145728.diff
55 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp (+6-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+14-4)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+13)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+42-6)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+34-4)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+11)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+11)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+13)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+18)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+18)
- (modified) llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll (+14-6)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+60-36)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+58-36)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+58-36)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+64-36)
- (modified) llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+11-2)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+152-36)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+58-36)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+58-36)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+64-36)
- (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+20-13)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+64-16)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll (+16)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+15-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll (+11)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.mulo.ll (+9-4)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/mad_64_32.ll (+6-2)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll (+10-8)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll (+10-8)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+22-11)
- (modified) llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll (+7)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll (+12-11)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll (+12-11)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll (+12-11)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll (+12-11)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 10efbab805676..2ff4c41991562 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -50,8 +50,12 @@ class AMDGPUInsertDelayAlu {
static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
// These instruction types wait for VA_SDST==0 before issuing.
const uint64_t VA_SDST_0 = SIInstrFlags::SALU | SIInstrFlags::SMRD;
-
- return MI.getDesc().TSFlags & VA_SDST_0;
+ if (MI.getDesc().TSFlags & VA_SDST_0) {
+ for (auto &Op : MI.uses())
+ if (Op.isReg())
+ return true;
+ }
+ return false;
}
// Types of delay that can be encoded in an s_delay_alu instruction.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index cbadd1eb431fc..788a4e6fb2141 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -3070,6 +3070,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
; GFX12-NEXT: flat_store_b32 v[0:1], v3
; GFX12-NEXT: s_endpgm
@@ -4161,6 +4162,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index dd37e855c4a3f..a224c8b391323 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -1025,11 +1025,12 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v2, v11
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0
; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2387,33 +2388,39 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
; GFX12-NEXT: v_mov_b32_e32 v20, v22
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
; GFX12-NEXT: s_wait_alu 0xfffd
; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
; GFX12-NEXT: v_mov_b32_e32 v19, v22
; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
@@ -2434,6 +2441,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
@@ -2447,6 +2455,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
; GFX12-NEXT: s_wait_alu 0xf1ff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
@@ -2463,9 +2472,10 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1
; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i256 %num, %den
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 8319e112f526e..4b68f8a4bd194 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -240,6 +240,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -453,6 +454,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -482,6 +484,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
@@ -514,6 +517,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -544,6 +548,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
@@ -882,6 +887,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -926,6 +932,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: s_wait_alu 0xf1ff
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1285,6 +1292,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1331,6 +1339,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: s_wait_loadcnt 0x0
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: s_wait_alu 0xf1ff
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1968,6 +1977,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_endpgm
@@ -2000,6 +2010,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_wait_alu 0xf1ff
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_endpgm
@@ -2338,6 +2349,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu 0xf1ff
+; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -2383,6 +2395,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_wait_alu 0xf1ff
+; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index f979d01e495ba..389128012aef4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -223,6 +223,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -256,6 +257,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -291,6 +293,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1264-NEXT: s_mov_b32 s2, -1
; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -323,6 +326,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s2
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -933,6 +937,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1164_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1164_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1164_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -980,6 +985,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1132_ITERATIVE-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1132_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1132_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -1028,6 +1034,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1264_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1264_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1264_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1264_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -1075,6 +1082,7 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1232_ITERATIVE-NEXT: s_wait_kmcnt 0x0
; GFX1232_ITERATIVE-NEXT: v_readfirstlane_b32 s2, v1
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232_ITERATIVE-NEXT: v_add_nc_u32_e32 v0, s2, v0
; GFX1232_ITERATIVE-NEXT: s_mov_b32 s2, -1
; GFX1232_ITERATIVE-NEXT: buffer_store_b32 v0, off, s[0:3], null
@@ -2151,11 +2159,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
@@ -2194,11 +2203,12 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s5, v2, v[1:2]
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
@@ -2236,7 +2246,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: s_mov_b32 s2, -1
@@ -2276,7 +2286,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s4, v2, s[2:3]
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: s_mov_b32 s2, -1
@@ -3390,6 +3400,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31
; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff
+; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/145728
More information about the llvm-commits mailing list