[llvm] [AMDGPU] Change default loop alignment (PR #155343)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 25 19:15:44 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: None (hjagasiaAMD)
<details>
<summary>Changes</summary>
Align small loops aggressively to 32 bytes and larger loops to 16 bytes.
---
Patch is 3.86 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/155343.diff
153 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+38-18)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll (+48)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll (+48)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll (+5)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/is-safe-to-sink-bug.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/atomic-optimizer-strict-wqm.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll (+27)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+114)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+139)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll (+18)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll (+18)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (+5)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+205)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+118)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+118)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll (+12)
- (modified) llvm/test/CodeGen/AMDGPU/bug-sdag-emitcopyfromreg.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll (+19)
- (modified) llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/collapse-endcf.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll (+12)
- (modified) llvm/test/CodeGen/AMDGPU/copy-to-reg-frameindex.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/copy-to-reg.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+16)
- (modified) llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+472-470)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+418-416)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+418-416)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+484-478)
- (modified) llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+303-144)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll (+160-160)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+303-144)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll (+303-144)
- (modified) llvm/test/CodeGen/AMDGPU/fold-fabs.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+628-628)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+456-456)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+456-456)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+542-542)
- (modified) llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-atomics-min-max-system.ll (+128)
- (modified) llvm/test/CodeGen/AMDGPU/global-saddr-load.ll (+10)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+303-144)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll (+303-144)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+268-268)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+116-116)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+116-116)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+276-276)
- (modified) llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/idiv-licm.ll (+16)
- (modified) llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/infinite-loop.ll (+5)
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+11)
- (modified) llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/issue139317-bad-opsel-reg-sequence-fold.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.pops.exiting.wave.id.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll (+25)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll (+25)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll (+27)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll (+27)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+242-242)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+192-192)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+192-192)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+282-282)
- (modified) llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/loop-on-function-argument.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll (+18)
- (modified) llvm/test/CodeGen/AMDGPU/loop-prefetch.ll (+19-4)
- (modified) llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/loop_header_nopred.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/mdt-preserving-crash.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+33-3)
- (modified) llvm/test/CodeGen/AMDGPU/memmove-var-size.ll (+17)
- (modified) llvm/test/CodeGen/AMDGPU/mfma-loop.ll (+30)
- (modified) llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw-system.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/multilevel-break.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/no-dup-inst-prefetch.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+10)
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-phi.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/select-undef.ll (+18)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/should-not-hoist-set-inactive.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/skip-if-dead.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+7)
- (modified) llvm/test/CodeGen/AMDGPU/stack-realign.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/structurize-hoist.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/swdev380865.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/trap-abis.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+7)
- (modified) llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/uniform-cfg.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/uniform-select.ll (+2)
- (modified) llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+6)
- (modified) llvm/test/CodeGen/AMDGPU/v_swap_b16.ll (+4)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll (+1)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+8)
- (modified) llvm/test/CodeGen/AMDGPU/while-break.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/wqm.ll (+7)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 66c1dfc71c2f5..13fc92f64e8b1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -17501,26 +17501,18 @@ Align SITargetLowering::computeKnownAlignForTargetInstr(
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
const Align CacheLineAlign = Align(64);
-
- // Pre-GFX10 target did not benefit from loop alignment
- if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
- getSubtarget()->hasInstFwdPrefetchBug())
+ if (!ML || DisableLoopAlignment)
return PrefAlign;
-
- // On GFX10 I$ is 4 x 64 bytes cache lines.
- // By default prefetcher keeps one cache line behind and reads two ahead.
- // We can modify it with S_INST_PREFETCH for larger loops to have two lines
- // behind and one ahead.
- // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
- // If loop fits 64 bytes it always spans no more than two cache lines and
- // does not need an alignment.
- // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
- // Else if loop is less or equal 192 bytes we need two lines behind.
-
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
const MachineBasicBlock *Header = ML->getHeader();
if (Header->getAlignment() != PrefAlign)
return Header->getAlignment(); // Already processed.
+ const MachineFunction *MF = Header->getParent();
+ const Function &Fn = MF->getFunction();
+ for (auto &BB : Fn)
+ for (auto &I : BB)
+ if (isa<llvm::UnreachableInst>(&I))
+ return PrefAlign;
unsigned LoopSize = 0;
for (const MachineBasicBlock *MBB : ML->blocks()) {
@@ -17531,13 +17523,41 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
for (const MachineInstr &MI : *MBB) {
LoopSize += TII->getInstSizeInBytes(MI);
- if (LoopSize > 192)
- return PrefAlign;
}
+ if (LoopSize > 192)
+ break;
+ }
+
+ if (!getSubtarget()->hasInstPrefetch() ||
+ getSubtarget()->hasInstFwdPrefetchBug()) {
+ // Align loops < 32 bytes agrressively
+ if (LoopSize <= 32)
+ return Align(32);
+ // Align larger loops less aggressively
+ if (!ML->isInnermost())
+ return PrefAlign;
+ return Align(16);
+ }
+
+ // On GFX10 I$ is 4 x 64 bytes cache lines.
+ // By default prefetcher keeps one cache line behind and reads two ahead.
+ // We can modify it with S_INST_PREFETCH for larger loops to have two lines
+ // behind and one ahead.
+ // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
+ // If loop fits 64 bytes it always spans no more than two cache lines and
+ // does not need an alignment driven by prefetch considerations.
+ // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
+ // Else if loop is less or equal 192 bytes we need two lines behind.
+
+ // Align larger loops less aggressively
+ if (LoopSize > 192) {
+ if (!ML->isInnermost())
+ return PrefAlign;
+ return Align(16);
}
if (LoopSize <= 64)
- return PrefAlign;
+ return Align(32);
if (LoopSize <= 128)
return CacheLineAlign;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 666523c88860c..969a86c9810bc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -331,6 +331,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX942-NEXT: global_load_dword v3, v[0:1], off
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX942-NEXT: .p2align
; GFX942-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -376,6 +377,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX90A-NEXT: .p2align
; GFX90A-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -400,6 +402,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX908-NEXT: .p2align
; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -424,6 +427,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: .p2align
; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -478,6 +482,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX942-NEXT: global_load_dword v3, v[0:1], off
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX942-NEXT: .p2align
; GFX942-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -522,6 +527,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT: .p2align
; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -545,6 +551,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT: .p2align
; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -568,6 +575,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT: .p2align
; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -610,6 +618,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .p2align
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -648,6 +657,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .p2align
; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -694,6 +704,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .p2align
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -720,6 +731,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .p2align
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -767,6 +779,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .p2align
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -804,6 +817,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .p2align
; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -849,6 +863,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .p2align
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -873,6 +888,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .p2align
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -926,6 +942,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX942-NEXT: flat_load_dword v3, v[0:1]
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX942-NEXT: .p2align
; GFX942-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -971,6 +988,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX90A-NEXT: .p2align
; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -995,6 +1013,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
+; GFX908-NEXT: .p2align
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1019,6 +1038,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX8-NEXT: .p2align
; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1069,6 +1089,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX942-NEXT: flat_load_dword v3, v[0:1]
; GFX942-NEXT: s_mov_b64 s[0:1], 0
; GFX942-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX942-NEXT: .p2align
; GFX942-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1115,6 +1136,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX90A-NEXT: .p2align
; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1138,6 +1160,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v4, v2, v2
+; GFX908-NEXT: .p2align
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1161,6 +1184,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v2
+; GFX8-NEXT: .p2align
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1200,6 +1224,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .p2align
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1238,6 +1263,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .p2align
; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1284,6 +1310,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX908-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .p2align
; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1313,6 +1340,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .p2align
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1355,6 +1383,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .p2align
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1392,6 +1421,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .p2align
; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1438,6 +1468,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX908-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .p2align
; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1465,6 +1496,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[2:3]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .p2align
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1518,6 +1550,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX942-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v3, v1, v1
+; GFX942-NEXT: .p2align
; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -1567,6 +1600,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1
+; GFX90A-NEXT: .p2align
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1593,6 +1627,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v3, v1, v1
+; GFX908-NEXT: .p2align
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1620,6 +1655,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1
+; GFX8-NEXT: .p2align
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1674,6 +1710,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX942-NEXT: .p2align
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_waitcnt vmcnt(0)
@@ -1722,6 +1759,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX90A-NEXT: .p2align
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -1747,6 +1785,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX908-NEXT: .p2align
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -1773,6 +1812,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX8-NEXT: .p2align
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1817,6 +1857,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX12-NEXT: .p2align
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -1859,6 +1900,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_loa...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/155343
More information about the llvm-commits
mailing list