[llvm] [AMDGPU] Change the immediate operand of s_waitcnt_depctr / s_wait_alu (PR #169378)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 24 09:52:56 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
The 16-bit immediate operand of s_waitcnt_depctr / s_wait_alu has some
unused bits. Previously, codegen set these bits to 1, but setting them to 0
better matches the behaviour of the SP3 assembler, which in turn means that
the operand can be printed using the human-readable SP3 syntax:
    s_wait_alu 0xfffd ; unused bits set to 1
    s_wait_alu 0xff9d ; unused bits set to 0
    s_wait_alu depctr_va_vcc(0) ; unused bits set to 0, human-readable
Note that the set of unused bits changed between GFX10.1 and GFX10.3.
---
Patch is 3.81 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169378.diff
220 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp (+8-7)
- (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+11-10)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+1-1)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+21-14)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fabs.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll (+42-42)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll (+54-54)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f64.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fneg.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr-new-regbank-select.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.rsq.clamp.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/load-zero-and-sign-extending-uniform-in-vgpr.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sub.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/add_i1.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll (+44-44)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn-cs-chain-intrinsic-dyn-vgpr-w32.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-branch-weight-metadata.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+61-61)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+191-191)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+162-162)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+51-51)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+53-53)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (+30-30)
- (modified) llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation.ll (+63-63)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+107-107)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+132-132)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+132-132)
- (modified) llvm/test/CodeGen/AMDGPU/carryout-selection.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/cc-entry.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/code-size-estimate.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/cse-convergent.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/fcmp.f16.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+22-22)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.ll (+80-80)
- (modified) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+175-175)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+276-276)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+276-276)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+287-287)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/fma.f16.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll (+65-65)
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum3.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll (+65-65)
- (modified) llvm/test/CodeGen/AMDGPU/fminimum.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fminimum3.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fpow.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/freeze-binary.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+244-244)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+516-516)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll (+22-22)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+136-136)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+220-220)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+220-220)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+248-248)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll (+128-128)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll (+49-49)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll (+109-109)
- (modified) llvm/test/CodeGen/AMDGPU/idiv-licm.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/idot2.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/idot4s.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/idot8s.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/idot8u.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+25-25)
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/lds-direct-hazards-gfx11.mir (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dead.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll (+51-51)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (+64-64)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.add.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.sub.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.v3f16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll (+60-60)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+122-122)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+122-122)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+86-86)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.mulo.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.powi.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+65-65)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+31-31)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+31-31)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+94-94)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+100-100)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+100-100)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+120-120)
- (modified) llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/mad_64_32.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/madak.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll (+45-45)
- (modified) llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll (+838-838)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll (+118-118)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll (+118-118)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll (+120-120)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll (+110-110)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll (+120-120)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll (+120-120)
- (modified) llvm/test/CodeGen/AMDGPU/merge-consecutive-wait-alus.mir (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll (+838-838)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-flat.ll (+54-54)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-global.ll (+54-54)
- (modified) llvm/test/CodeGen/AMDGPU/partial-forwarding-hazards.mir (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/repeated-divisor.ll (+17-17)
- (modified) llvm/test/CodeGen/AMDGPU/s-barrier.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll (+90-90)
- (modified) llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/skip-if-dead.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/spill-vgpr-block.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/sub.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/sub.v2i16.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/sub_i1.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/trans-forwarding-hazards.mir (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/v_cndmask.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/v_swap_b16.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard-true16.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/valu-mask-write-hazard.mir (+55-55)
- (modified) llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard-attrs.mir (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/valu-read-sgpr-hazard.mir (+43-43)
- (modified) llvm/test/CodeGen/AMDGPU/vcmpx-exec-war-hazard.mir (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll (+53-53)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll (+52-52)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll (+52-52)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll (+48-48)
- (modified) llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll (+48-48)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll (+80-80)
- (modified) llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll (+29-29)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
index ded2f5ae1f8af..8d8386c3a8a7c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp
@@ -44,6 +44,7 @@ namespace {
class AMDGPUWaitSGPRHazards {
public:
+ const GCNSubtarget *ST;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const MachineRegisterInfo *MRI;
@@ -165,7 +166,7 @@ class AMDGPUWaitSGPRHazards {
}
unsigned mergeMasks(unsigned Mask1, unsigned Mask2) {
- unsigned Mask = 0xffff;
+ unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST);
Mask = AMDGPU::DepCtr::encodeFieldSaSdst(
Mask, std::min(AMDGPU::DepCtr::decodeFieldSaSdst(Mask1),
AMDGPU::DepCtr::decodeFieldSaSdst(Mask2)));
@@ -387,7 +388,7 @@ class AMDGPUWaitSGPRHazards {
// Apply wait
if (Wait) {
- unsigned Mask = 0xffff;
+ unsigned Mask = AMDGPU::DepCtr::getDefaultDepCtrEncoding(*ST);
if (Wait & WA_VCC) {
State.VCCHazard &= ~HazardState::VALU;
Mask = AMDGPU::DepCtr::encodeFieldVaVcc(Mask, 0);
@@ -438,8 +439,8 @@ class AMDGPUWaitSGPRHazards {
}
bool run(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (!ST.hasVALUReadSGPRHazard())
+ ST = &MF.getSubtarget<GCNSubtarget>();
+ if (!ST->hasVALUReadSGPRHazard())
return false;
// Parse settings
@@ -467,10 +468,10 @@ class AMDGPUWaitSGPRHazards {
if (!EnableSGPRHazardWaits)
return false;
- TII = ST.getInstrInfo();
- TRI = ST.getRegisterInfo();
+ TII = ST->getInstrInfo();
+ TRI = ST->getRegisterInfo();
MRI = &MF.getRegInfo();
- DsNopCount = ST.isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
+ DsNopCount = ST->isWave64() ? WAVE64_NOPS : WAVE32_NOPS;
auto CallingConv = MF.getFunction().getCallingConv();
if (!AMDGPU::isEntryFunctionCC(CallingConv) &&
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 29d22f27a2d8e..7fbf520c670ae 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1354,7 +1354,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
const SIInstrInfo *TII = ST.getInstrInfo();
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));
return true;
}
@@ -1487,7 +1487,7 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST));
return true;
}
@@ -1651,7 +1651,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
} else {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0, ST));
}
return true;
@@ -1809,7 +1809,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST));
return true;
}
@@ -1895,7 +1895,7 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
// avoided.
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0, ST));
return true;
}
@@ -3404,7 +3404,8 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
};
const unsigned ConstantMaskBits = AMDGPU::DepCtr::encodeFieldSaSdst(
- AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0), 0),
+ AMDGPU::DepCtr::encodeFieldVaSdst(AMDGPU::DepCtr::encodeFieldVaVcc(0, ST),
+ 0),
0);
auto UpdateStateFn = [&](StateType &State, const MachineInstr &I) {
switch (I.getOpcode()) {
@@ -3456,9 +3457,9 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
// Compute counter mask
unsigned DepCtr =
- IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0)
- : AMDGPU::DepCtr::encodeFieldVaSdst(0))
- : AMDGPU::DepCtr::encodeFieldSaSdst(0);
+ IsVALU ? (IsVCC(HazardReg) ? AMDGPU::DepCtr::encodeFieldVaVcc(0, ST)
+ : AMDGPU::DepCtr::encodeFieldVaSdst(0, ST))
+ : AMDGPU::DepCtr::encodeFieldSaSdst(0, ST);
// Try to merge previous waits into this one for regions with no SGPR reads.
if (!WaitInstrs.empty()) {
@@ -3723,7 +3724,7 @@ bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
.addImm(AMDGPU::DepCtr::encodeFieldVaSdst(
- AMDGPU::DepCtr::encodeFieldSaSdst(0), 0));
+ AMDGPU::DepCtr::encodeFieldSaSdst(0, ST), 0));
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index a7333e3373f38..4cca01690d534 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2934,7 +2934,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
auto ApplyHazardWorkarounds = [this, &MBB, &I, &DL, FlushSGPRWrites]() {
if (FlushSGPRWrites)
BuildMI(MBB, I, DL, get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0, ST));
};
// We need to compute the offset relative to the instruction immediately after
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 998a2b5b36a87..90f0b49ab9a78 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2052,56 +2052,63 @@ unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) {
return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth());
}
-unsigned encodeFieldVmVsrc(unsigned VmVsrc) {
- return encodeFieldVmVsrc(0xffff, VmVsrc);
+unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVmVsrc(Encoded, VmVsrc);
}
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst) {
return packBits(VaVdst, Encoded, getVaVdstBitShift(), getVaVdstBitWidth());
}
-unsigned encodeFieldVaVdst(unsigned VaVdst) {
- return encodeFieldVaVdst(0xffff, VaVdst);
+unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaVdst(Encoded, VaVdst);
}
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst) {
return packBits(SaSdst, Encoded, getSaSdstBitShift(), getSaSdstBitWidth());
}
-unsigned encodeFieldSaSdst(unsigned SaSdst) {
- return encodeFieldSaSdst(0xffff, SaSdst);
+unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldSaSdst(Encoded, SaSdst);
}
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst) {
return packBits(VaSdst, Encoded, getVaSdstBitShift(), getVaSdstBitWidth());
}
-unsigned encodeFieldVaSdst(unsigned VaSdst) {
- return encodeFieldVaSdst(0xffff, VaSdst);
+unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaSdst(Encoded, VaSdst);
}
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc) {
return packBits(VaVcc, Encoded, getVaVccBitShift(), getVaVccBitWidth());
}
-unsigned encodeFieldVaVcc(unsigned VaVcc) {
- return encodeFieldVaVcc(0xffff, VaVcc);
+unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaVcc(Encoded, VaVcc);
}
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc) {
return packBits(VaSsrc, Encoded, getVaSsrcBitShift(), getVaSsrcBitWidth());
}
-unsigned encodeFieldVaSsrc(unsigned VaSsrc) {
- return encodeFieldVaSsrc(0xffff, VaSsrc);
+unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldVaSsrc(Encoded, VaSsrc);
}
unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt) {
return packBits(HoldCnt, Encoded, getHoldCntBitShift(), getHoldCntWidth());
}
-unsigned encodeFieldHoldCnt(unsigned HoldCnt) {
- return encodeFieldHoldCnt(0xffff, HoldCnt);
+unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI) {
+ unsigned Encoded = getDefaultDepCtrEncoding(STI);
+ return encodeFieldHoldCnt(Encoded, HoldCnt);
}
} // namespace DepCtr
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 9f65f9326a73e..4ca328eb06303 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1301,43 +1301,43 @@ unsigned decodeFieldVaSsrc(unsigned Encoded);
unsigned decodeFieldHoldCnt(unsigned Encoded);
/// \returns \p VmVsrc as an encoded Depctr immediate.
-unsigned encodeFieldVmVsrc(unsigned VmVsrc);
+unsigned encodeFieldVmVsrc(unsigned VmVsrc, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VmVsrc.
unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc);
/// \returns \p VaVdst as an encoded Depctr immediate.
-unsigned encodeFieldVaVdst(unsigned VaVdst);
+unsigned encodeFieldVaVdst(unsigned VaVdst, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaVdst.
unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst);
/// \returns \p SaSdst as an encoded Depctr immediate.
-unsigned encodeFieldSaSdst(unsigned SaSdst);
+unsigned encodeFieldSaSdst(unsigned SaSdst, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p SaSdst.
unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst);
/// \returns \p VaSdst as an encoded Depctr immediate.
-unsigned encodeFieldVaSdst(unsigned VaSdst);
+unsigned encodeFieldVaSdst(unsigned VaSdst, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaSdst.
unsigned encodeFieldVaSdst(unsigned Encoded, unsigned VaSdst);
/// \returns \p VaVcc as an encoded Depctr immediate.
-unsigned encodeFieldVaVcc(unsigned VaVcc);
+unsigned encodeFieldVaVcc(unsigned VaVcc, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaVcc.
unsigned encodeFieldVaVcc(unsigned Encoded, unsigned VaVcc);
/// \returns \p HoldCnt as an encoded Depctr immediate.
-unsigned encodeFieldHoldCnt(unsigned HoldCnt);
+unsigned encodeFieldHoldCnt(unsigned HoldCnt, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p HoldCnt.
-unsigned encodeFieldHoldCnt(unsigned HoldCnt, unsigned Encoded);
+unsigned encodeFieldHoldCnt(unsigned Encoded, unsigned HoldCnt);
/// \returns \p VaSsrc as an encoded Depctr immediate.
-unsigned encodeFieldVaSsrc(unsigned VaSsrc);
+unsigned encodeFieldVaSsrc(unsigned VaSsrc, const MCSubtargetInfo &STI);
/// \returns \p Encoded combined with encoded \p VaSsrc.
unsigned encodeFieldVaSsrc(unsigned Encoded, unsigned VaSsrc);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
index e11720011af10..d6f1b142b36e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.ll
@@ -50,7 +50,7 @@ define i16 @s_add_i16(i16 inreg %a, i16 inreg %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = add i16 %a, %b
@@ -145,7 +145,7 @@ define i32 @s_add_i32(i32 inreg %a, i32 inreg %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = add i32 %a, %b
@@ -263,11 +263,11 @@ define <2 x i16> @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
; GFX12-NEXT: s_lshr_b32 s2, s0, 16
; GFX12-NEXT: s_lshr_b32 s3, s1, 16
; GFX12-NEXT: s_add_co_i32 s0, s0, s1
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_add_co_i32 s2, s2, s3
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s2
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = add <2 x i16> %a, %b
@@ -374,7 +374,7 @@ define i64 @s_add_i64(i64 inreg %a, i64 inreg %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = add i64 %a, %b
@@ -425,7 +425,7 @@ define i64 @v_add_i64(i64 %a, i64 %b) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo
; GFX12-NEXT: s_setpc_b64 s[30:31]
%c = add i64 %a, %b
@@ -513,7 +513,7 @@ define void @s_uaddo_uadde(i64 inreg %a, i64 inreg %b, ptr addrspace(1) %res, pt
; GFX12-NEXT: s_add_co_u32 s0, s0, s2
; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s3
; GFX12-NEXT: s_cselect_b32 s2, 1, 0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: v_mov_b32_e32 v6, s2
; GFX12-NEXT: global_store_b64 v[0:1], v[4:5], off
@@ -593,9 +593,9 @@ define void @v_uaddo_uadde(i64 %a, i64 %b, ptr addrspace(1) %res, ptr addrspace(
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT: s_wait_alu 0xfffd
+; GFX12-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off
; GFX12-NEXT: global_store_b32 v[6:7], v2, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index ff618c05e2b80..8063b29c29985 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -622,9 +622,9 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -779,9 +779,9 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1212,9 +1212,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB10_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1367,9 +1367,9 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1833,7 +1833,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -2000,7 +2000,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 007417c83e324..5b0b602bd99ba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -622,9 +622,9 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB6_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -779,9 +779,9 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
-; GFX12-NEXT: s_wait_alu 0xfffe
+; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_wa...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/169378
More information about the llvm-commits mailing list