[llvm] [AMDGPU] Align loop headers to prevent instruction fetch split on GFX950 (PR #181999)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 19 02:58:47 PST 2026
https://github.com/michaelselehov updated https://github.com/llvm/llvm-project/pull/181999
>From ce48525aac76d715e5e01a04a7abbf7ab7d5e463 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Wed, 18 Feb 2026 05:35:15 -0600
Subject: [PATCH 1/4] [AMDGPU] Align loop headers to prevent instruction fetch
split on GFX950
On GFX9, the instruction sequencer fetches 32 bytes at a time. When
an 8-byte instruction at a loop header straddles a 32-byte fetch window
boundary, the sequencer must perform two fetches after a backward branch,
incurring a delay. On GFX950, this causes additional performance issues.
This patch adds 32-byte alignment (.p2align 5, , 4) for loop headers
on GFX950 when the first real instruction is 8 bytes. At most one s_nop
(4 bytes, 1 quad-cycle before the loop) is used for padding. If more
than 4 bytes of padding were needed, the 8-byte instruction would not
straddle a 32-byte boundary anyway, so alignment is skipped.
Note: the alignment decision is made during block-placement, before
si-insert-waitcnts. In loops where a 4-byte S_WAITCNT is later inserted
as the first instruction, the alignment becomes redundant but mostly harmless
(at most one extra s_nop per affected loop).
Assisted-by: Claude (Anthropic)
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 33 +++++++++
llvm/lib/Target/AMDGPU/SIISelLowering.h | 7 ++
.../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 68 +++++++++++++++++
.../CodeGen/AMDGPU/a-v-global-atomicrmw.ll | 68 +++++++++++++++++
.../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 10 +++
.../AMDGPU/loop-header-align-gfx950.mir | 74 +++++++++++++++++++
6 files changed, 260 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/loop-header-align-gfx950.mir
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5b83574dee0c3..338a75fc1e2a3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18741,6 +18741,19 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
const Align CacheLineAlign = Align(64);
+ // GFX950: Prevent an 8-byte instruction at a loop header from being split by
+ // the 32-byte instruction fetch window boundary. This avoids a significant
+ // fetch delay after a backward branch. We use 32-byte alignment with max
+ // padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
+ if (ML && !DisableLoopAlignment && getSubtarget()->hasGFX950Insts()) {
+ const MachineBasicBlock *Header = ML->getHeader();
+ // Respect user-specified or previously set alignment.
+ if (Header->getAlignment() != PrefAlign)
+ return Header->getAlignment();
+ if (needsFetchWindowAlignment(Header))
+ return Align(32);
+ }
+
// Pre-GFX10 target did not benefit from loop alignment
if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
getSubtarget()->hasInstFwdPrefetchBug())
@@ -18811,6 +18824,26 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
return CacheLineAlign;
}
+unsigned SITargetLowering::getMaxPermittedBytesForAlignment(
+ MachineBasicBlock *MBB) const {
+ // GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
+ // instruction could be split by the 32-byte fetch window boundary.
+ // See getPrefLoopAlignment() for context.
+ if (needsFetchWindowAlignment(MBB))
+ return 4;
+ return TargetLowering::getMaxPermittedBytesForAlignment(MBB);
+}
+
+bool SITargetLowering::needsFetchWindowAlignment(
+ const MachineBasicBlock *MBB) const {
+ if (!getSubtarget()->hasGFX950Insts() || !MBB)
+ return false;
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ auto I = MBB->getFirstNonDebugInstr();
+ // Only 8-byte instructions can be split by a 32-byte boundary.
+ return I != MBB->end() && TII->getInstSizeInBytes(*I) == 8;
+}
+
[[maybe_unused]]
static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
assert(N->getOpcode() == ISD::CopyFromReg);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 59b8f434957ce..a17037915231e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -286,6 +286,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
EVT PtrVT) const override;
private:
+ /// Returns true if the first non-debug instruction in MBB is 8 bytes and
+ /// could be split by a 32-byte fetch window boundary. Used on GFX950 to avoid
+ /// instruction fetch delays.
+ bool needsFetchWindowAlignment(const MachineBasicBlock *MBB) const;
+
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]
// array pointed to by Offsets.
@@ -590,6 +595,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool requiresUniformRegister(MachineFunction &MF,
const Value *V) const override;
Align getPrefLoopAlignment(MachineLoop *ML) const override;
+ unsigned
+ getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override;
void allocateHSAUserSGPRs(CCState &CCInfo,
MachineFunction &MF,
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index 0a300674df505..130a4c2c92c73 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -5566,6 +5566,7 @@ define void @flat_atomic_nand_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5627,6 +5628,7 @@ define void @flat_atomic_nand_i32_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6318,6 +6320,7 @@ define void @flat_atomic_usub_sat_i32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6378,6 +6381,7 @@ define void @flat_atomic_usub_sat_i32_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9692,6 +9696,7 @@ define void @flat_atomic_fmaximum_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9755,6 +9760,7 @@ define void @flat_atomic_fmaximum_f32_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9820,6 +9826,7 @@ define void @flat_atomic_fminimum_f32_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9883,6 +9890,7 @@ define void @flat_atomic_fminimum_f32_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10251,6 +10259,7 @@ define void @flat_atomic_fsub_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB129_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10365,6 +10374,7 @@ define void @flat_atomic_fsub_f64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB130_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10868,6 +10878,7 @@ define void @flat_atomic_fmaximum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB135_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10999,6 +11010,7 @@ define void @flat_atomic_fmaximum_f64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB136_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11135,6 +11147,7 @@ define void @flat_atomic_fminimum_f64_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB137_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11266,6 +11279,7 @@ define void @flat_atomic_fminimum_f64_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB138_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11454,6 +11468,7 @@ define void @flat_atomic_fsub_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB141_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11514,6 +11529,7 @@ define void @flat_atomic_fsub_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11579,6 +11595,7 @@ define void @flat_atomic_fmax_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB143_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11644,6 +11661,7 @@ define void @flat_atomic_fmax_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11711,6 +11729,7 @@ define void @flat_atomic_fmin_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB145_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11776,6 +11795,7 @@ define void @flat_atomic_fmin_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB146_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11847,6 +11867,7 @@ define void @flat_atomic_fmaximum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB147_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11914,6 +11935,7 @@ define void @flat_atomic_fmaximum_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB148_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11983,6 +12005,7 @@ define void @flat_atomic_fminimum_v2f16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB149_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12050,6 +12073,7 @@ define void @flat_atomic_fminimum_v2f16_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB150_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12269,6 +12293,7 @@ define void @flat_atomic_fsub_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB153_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12353,6 +12378,7 @@ define void @flat_atomic_fsub_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB154_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12439,6 +12465,7 @@ define void @flat_atomic_fmax_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB155_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12523,6 +12550,7 @@ define void @flat_atomic_fmax_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB156_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12609,6 +12637,7 @@ define void @flat_atomic_fmin_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB157_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12693,6 +12722,7 @@ define void @flat_atomic_fmin_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB158_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12784,6 +12814,7 @@ define void @flat_atomic_fmaximum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB159_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12873,6 +12904,7 @@ define void @flat_atomic_fmaximum_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB160_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12964,6 +12996,7 @@ define void @flat_atomic_fminimum_v2bf16_ret_a_a(ptr %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB161_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13053,6 +13086,7 @@ define void @flat_atomic_fminimum_v2bf16_ret_av_av(ptr %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB162_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13444,6 +13478,7 @@ define void @flat_atomic_nand_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB171_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13509,6 +13544,7 @@ define void @flat_atomic_nand_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB172_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14347,6 +14383,7 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB191_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -14411,6 +14448,7 @@ define void @flat_atomic_usub_sat_i32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB192_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -18014,6 +18052,7 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB231_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -18081,6 +18120,7 @@ define void @flat_atomic_fmaximum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -18150,6 +18190,7 @@ define void @flat_atomic_fminimum_f32_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB233_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -18217,6 +18258,7 @@ define void @flat_atomic_fminimum_f32_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -18566,6 +18608,7 @@ define void @flat_atomic_fsub_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB237_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -18676,6 +18719,7 @@ define void @flat_atomic_fsub_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB238_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -19163,6 +19207,7 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB243_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -19290,6 +19335,7 @@ define void @flat_atomic_fmaximum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB244_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -19422,6 +19468,7 @@ define void @flat_atomic_fminimum_f64_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB245_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -19549,6 +19596,7 @@ define void @flat_atomic_fminimum_f64_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: flat_load_dwordx2 v[2:3], v[6:7]
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB246_2: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -19746,6 +19794,7 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB249_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -19810,6 +19859,7 @@ define void @flat_atomic_fsub_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB250_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -19879,6 +19929,7 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -19948,6 +19999,7 @@ define void @flat_atomic_fmax_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB252_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20019,6 +20071,7 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20088,6 +20141,7 @@ define void @flat_atomic_fmin_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_pk_max_f16 v4, v0, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB254_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20163,6 +20217,7 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB255_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20234,6 +20289,7 @@ define void @flat_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB256_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20307,6 +20363,7 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB257_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20378,6 +20435,7 @@ define void @flat_atomic_fminimum_v2f16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB258_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20609,6 +20667,7 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB261_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20697,6 +20756,7 @@ define void @flat_atomic_fsub_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB262_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20787,6 +20847,7 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB263_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20875,6 +20936,7 @@ define void @flat_atomic_fmax_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB264_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -20965,6 +21027,7 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB265_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21053,6 +21116,7 @@ define void @flat_atomic_fmin_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB266_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21148,6 +21212,7 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB267_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21241,6 +21306,7 @@ define void @flat_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB268_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21336,6 +21402,7 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB269_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -21429,6 +21496,7 @@ define void @flat_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr inreg %ptr) #0 {
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB270_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
index 76ef16ad33462..37cad3c4596d8 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-global-atomicrmw.ll
@@ -4022,6 +4022,7 @@ define void @global_atomic_nand_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -4083,6 +4084,7 @@ define void @global_atomic_nand_i32_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -4774,6 +4776,7 @@ define void @global_atomic_usub_sat_i32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -4834,6 +4837,7 @@ define void @global_atomic_usub_sat_i32_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -6536,6 +6540,7 @@ define void @global_atomic_fmaximum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -6599,6 +6604,7 @@ define void @global_atomic_fmaximum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -6664,6 +6670,7 @@ define void @global_atomic_fminimum_f32_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -6727,6 +6734,7 @@ define void @global_atomic_fminimum_f32_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -6871,6 +6879,7 @@ define void @global_atomic_fsub_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB129_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -6932,6 +6941,7 @@ define void @global_atomic_fsub_f64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7152,6 +7162,7 @@ define void @global_atomic_fmaximum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB135_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7222,6 +7233,7 @@ define void @global_atomic_fmaximum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB136_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7296,6 +7308,7 @@ define void @global_atomic_fminimum_f64_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v7, a1
; GFX950-NEXT: v_accvgpr_read_b32 v6, a0
; GFX950-NEXT: v_mov_b32_e32 v8, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB137_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7366,6 +7379,7 @@ define void @global_atomic_fminimum_f64_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[6:7]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB138_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7507,6 +7521,7 @@ define void @global_atomic_fsub_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB141_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7567,6 +7582,7 @@ define void @global_atomic_fsub_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7632,6 +7648,7 @@ define void @global_atomic_fmax_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB143_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7697,6 +7714,7 @@ define void @global_atomic_fmax_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7764,6 +7782,7 @@ define void @global_atomic_fmin_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB145_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7829,6 +7848,7 @@ define void @global_atomic_fmin_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_pk_max_f16 v4, v2, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB146_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7900,6 +7920,7 @@ define void @global_atomic_fmaximum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB147_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -7967,6 +7988,7 @@ define void @global_atomic_fmaximum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB148_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -8036,6 +8058,7 @@ define void @global_atomic_fminimum_v2f16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB149_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -8103,6 +8126,7 @@ define void @global_atomic_fminimum_v2f16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v4
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB150_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -8322,6 +8346,7 @@ define void @global_atomic_fsub_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB153_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -8406,6 +8431,7 @@ define void @global_atomic_fsub_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB154_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -8492,6 +8518,7 @@ define void @global_atomic_fmax_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB155_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -8576,6 +8603,7 @@ define void @global_atomic_fmax_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB156_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -8662,6 +8690,7 @@ define void @global_atomic_fmin_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB157_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -8746,6 +8775,7 @@ define void @global_atomic_fmin_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB158_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -8837,6 +8867,7 @@ define void @global_atomic_fmaximum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB159_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -8926,6 +8957,7 @@ define void @global_atomic_fmaximum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB160_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -9017,6 +9049,7 @@ define void @global_atomic_fminimum_v2bf16_ret_a_a(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: v_accvgpr_read_b32 v2, a0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB161_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -9106,6 +9139,7 @@ define void @global_atomic_fminimum_v2bf16_ret_av_av(ptr addrspace(1) %ptr) #0 {
; GFX950-NEXT: s_mov_b64 s[0:1], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB162_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -9479,6 +9513,7 @@ define void @global_atomic_nand_i32_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB171_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -9542,6 +9577,7 @@ define void @global_atomic_nand_i32_saddr_ret_av_av(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB172_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -10342,6 +10378,7 @@ define void @global_atomic_usub_sat_i32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB191_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -10404,6 +10441,7 @@ define void @global_atomic_usub_sat_i32_saddr_ret_av_av(ptr addrspace(1) inreg %
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB192_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -12322,6 +12360,7 @@ define void @global_atomic_fmaximum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB231_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -12387,6 +12426,7 @@ define void @global_atomic_fmaximum_f32_saddr_ret_av_av(ptr addrspace(1) inreg %
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB232_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -12454,6 +12494,7 @@ define void @global_atomic_fminimum_f32_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB233_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -12519,6 +12560,7 @@ define void @global_atomic_fminimum_f32_saddr_ret_av_av(ptr addrspace(1) inreg %
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB234_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -12668,6 +12710,7 @@ define void @global_atomic_fsub_f64_saddr_ret_a_a(ptr addrspace(1) inreg %ptr) #
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB237_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -12731,6 +12774,7 @@ define void @global_atomic_fsub_f64_saddr_ret_av_av(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB238_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -12959,6 +13003,7 @@ define void @global_atomic_fmaximum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB243_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13031,6 +13076,7 @@ define void @global_atomic_fmaximum_f64_saddr_ret_av_av(ptr addrspace(1) inreg %
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB244_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13107,6 +13153,7 @@ define void @global_atomic_fminimum_f64_saddr_ret_a_a(ptr addrspace(1) inreg %pt
; GFX950-NEXT: v_accvgpr_read_b32 v5, a1
; GFX950-NEXT: v_accvgpr_read_b32 v4, a0
; GFX950-NEXT: v_mov_b32_e32 v7, 0x7ff80000
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB245_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13179,6 +13226,7 @@ define void @global_atomic_fminimum_f64_saddr_ret_av_av(ptr addrspace(1) inreg %
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB246_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13326,6 +13374,7 @@ define void @global_atomic_fsub_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB249_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13388,6 +13437,7 @@ define void @global_atomic_fsub_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB250_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13455,6 +13505,7 @@ define void @global_atomic_fmax_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v3, v0, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB251_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13522,6 +13573,7 @@ define void @global_atomic_fmax_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_pk_max_f16 v3, v0, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB252_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13591,6 +13643,7 @@ define void @global_atomic_fmin_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr)
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_pk_max_f16 v3, v0, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB253_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13658,6 +13711,7 @@ define void @global_atomic_fmin_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg %pt
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_pk_max_f16 v3, v0, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB254_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13731,6 +13785,7 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB255_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13800,6 +13855,7 @@ define void @global_atomic_fmaximum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB256_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13871,6 +13927,7 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_a_a(ptr addrspace(1) inreg %
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB257_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -13940,6 +13997,7 @@ define void @global_atomic_fminimum_v2f16_saddr_ret_av_av(ptr addrspace(1) inreg
; GFX950-NEXT: ;;#ASMSTART
; GFX950-NEXT: ; def v3
; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB258_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -14165,6 +14223,7 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB261_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -14251,6 +14310,7 @@ define void @global_atomic_fsub_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB262_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -14339,6 +14399,7 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB263_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -14425,6 +14486,7 @@ define void @global_atomic_fmax_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB264_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -14513,6 +14575,7 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg %ptr
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB265_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -14599,6 +14662,7 @@ define void @global_atomic_fmin_v2bf16_saddr_ret_av_av(ptr addrspace(1) inreg %p
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB266_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -14692,6 +14756,7 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB267_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -14783,6 +14848,7 @@ define void @global_atomic_fmaximum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inre
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB268_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -14876,6 +14942,7 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_a_a(ptr addrspace(1) inreg
; GFX950-NEXT: v_accvgpr_read_b32 v0, a0
; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB269_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
@@ -14967,6 +15034,7 @@ define void @global_atomic_fminimum_v2bf16_saddr_ret_av_av(ptr addrspace(1) inre
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB270_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 54fb38ba877ad..6194f71e3c10e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -13328,6 +13328,7 @@ define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %
; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40
; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-SDAG-NEXT: .p2align 5, , 4
; GFX950-SDAG-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13410,6 +13411,7 @@ define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data
; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40
; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-SDAG-NEXT: .p2align 5, , 4
; GFX950-SDAG-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13434,6 +13436,7 @@ define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data
; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40
; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-GISEL-NEXT: .p2align 5, , 4
; GFX950-GISEL-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13492,6 +13495,7 @@ define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %
; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40
; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-SDAG-NEXT: .p2align 5, , 4
; GFX950-SDAG-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13574,6 +13578,7 @@ define void @flat_atomic_fmin_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data
; GFX950-SDAG-NEXT: flat_load_dword v1, v[2:3] offset:40
; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-SDAG-NEXT: .p2align 5, , 4
; GFX950-SDAG-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13598,6 +13603,7 @@ define void @flat_atomic_fmin_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data
; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40
; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0
; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-GISEL-NEXT: .p2align 5, , 4
; GFX950-GISEL-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13703,6 +13709,7 @@ define <2 x bfloat> @flat_atomic_fmax_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13760,6 +13767,7 @@ define void @flat_atomic_fmax_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %d
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB131_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13819,6 +13827,7 @@ define <2 x bfloat> @flat_atomic_fmin_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bflo
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB132_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13876,6 +13885,7 @@ define void @flat_atomic_fmin_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %d
; GFX950-NEXT: s_mov_b64 s[2:3], 0
; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: .p2align 5, , 4
; GFX950-NEXT: .LBB133_1: ; %atomicrmw.start
; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/loop-header-align-gfx950.mir b/llvm/test/CodeGen/AMDGPU/loop-header-align-gfx950.mir
new file mode 100644
index 0000000000000..bd32f319277c7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/loop-header-align-gfx950.mir
@@ -0,0 +1,74 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -start-before=block-placement -o - %s | FileCheck %s
+
+# Test that loop headers are aligned to 32 bytes on GFX950 when the first
+# instruction is 8 bytes, to prevent the instruction from being split by the
+# 32-byte fetch window boundary.
+# The second test case verifies that 4-byte instructions do NOT trigger
+# alignment (CHECK-NEXT chain would break if .p2align were inserted).
+
+---
+name: loop_with_8byte_first_inst
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: loop_with_8byte_first_inst:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; CHECK-NEXT: s_mov_b64 s[0:1], 0
+ ; CHECK-NEXT: .p2align 5, , 4
+ ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+ ; CHECK-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0_vgpr1
+
+ renamable $sgpr0_sgpr1 = S_MOV_B64 0
+
+ bb.1:
+ successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+ liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+
+ renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 1, killed renamable $vgpr0_vgpr1, implicit $exec
+ V_CMP_EQ_U64_e32 0, $vgpr0_vgpr1, implicit-def $vcc, implicit $exec
+ renamable $sgpr0_sgpr1 = S_OR_B64 killed renamable $vcc, killed renamable $sgpr0_sgpr1, implicit-def $scc
+ $exec = S_ANDN2_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
+ S_CBRANCH_EXECNZ %bb.1, implicit $exec
+
+ bb.2:
+ liveins: $sgpr0_sgpr1
+
+ $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
+ S_SETPC_B64_return undef $sgpr30_sgpr31
+...
+
+---
+name: loop_with_4byte_first_inst
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: loop_with_4byte_first_inst:
+ ; CHECK: ; %bb.0:
+ ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; CHECK-NEXT: s_mov_b64 s[0:1], 0
+ ; CHECK-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+ ; CHECK-NEXT: v_add_u32_e32 v0, 1, v0
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $vgpr0
+
+ renamable $sgpr0_sgpr1 = S_MOV_B64 0
+
+ bb.1:
+ successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+ liveins: $sgpr0_sgpr1, $vgpr0
+
+ renamable $vgpr0 = V_ADD_U32_e32 1, killed $vgpr0, implicit $exec
+ V_CMP_LT_U32_e32 10, $vgpr0, implicit-def $vcc, implicit $exec
+ renamable $sgpr0_sgpr1 = S_OR_B64 killed renamable $vcc, killed renamable $sgpr0_sgpr1, implicit-def $scc
+ $exec = S_ANDN2_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
+ S_CBRANCH_EXECNZ %bb.1, implicit $exec
+
+ bb.2:
+ liveins: $sgpr0_sgpr1
+
+ $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
+ S_SETPC_B64_return undef $sgpr30_sgpr31
+...
>From 30b7f92f9ecfd9ebaa4738a29f35d2cf5c075cf9 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Wed, 18 Feb 2026 10:41:49 -0600
Subject: [PATCH 2/4] Skipping all meta instructions instead of only debug
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 338a75fc1e2a3..0379ac582f0b0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18839,9 +18839,13 @@ bool SITargetLowering::needsFetchWindowAlignment(
if (!getSubtarget()->hasGFX950Insts() || !MBB)
return false;
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- auto I = MBB->getFirstNonDebugInstr();
- // Only 8-byte instructions can be split by a 32-byte boundary.
- return I != MBB->end() && TII->getInstSizeInBytes(*I) == 8;
+ for (const MachineInstr &MI : *MBB) {
+ if (MI.isMetaInstruction())
+ continue;
+ // Instructions larger than 4 bytes can be split by a 32-byte boundary.
+ return TII->getInstSizeInBytes(MI) > 4;
+ }
+ return false;
}
[[maybe_unused]]
>From 5f41f84b8e6c10ec66cb1d26a0d9b7f58b7b8496 Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 19 Feb 2026 03:28:53 -0600
Subject: [PATCH 3/4] Helper to check for gfx950
---
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 ++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 +++--
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index b308e0d77305f..625f6a3dfa719 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -576,6 +576,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
+ bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }
+
bool requiresCodeObjectV6() const { return RequiresCOV6; }
bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0379ac582f0b0..97da96fc7db0a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18745,7 +18745,8 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
// the 32-byte instruction fetch window boundary. This avoids a significant
// fetch delay after backward branch. We use 32-byte alignment with max
// padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
- if (ML && !DisableLoopAlignment && getSubtarget()->hasGFX950Insts()) {
+ if (ML && !DisableLoopAlignment &&
+ getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
const MachineBasicBlock *Header = ML->getHeader();
// Respect user-specified or previously set alignment.
if (Header->getAlignment() != PrefAlign)
@@ -18836,7 +18837,7 @@ unsigned SITargetLowering::getMaxPermittedBytesForAlignment(
bool SITargetLowering::needsFetchWindowAlignment(
const MachineBasicBlock *MBB) const {
- if (!getSubtarget()->hasGFX950Insts() || !MBB)
+ if (!getSubtarget()->hasLoopHeadInstSplitSensitivity() || !MBB)
return false;
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
for (const MachineInstr &MI : *MBB) {
>From 9602d2faa27ed0eac1d041f3c9acd0a8562f57ce Mon Sep 17 00:00:00 2001
From: Michael Selehov <michael.selehov at amd.com>
Date: Thu, 19 Feb 2026 04:58:32 -0600
Subject: [PATCH 4/4] Changed pointer to reference, added comment about gfx950
---
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 ++++
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +++++-----
llvm/lib/Target/AMDGPU/SIISelLowering.h | 2 +-
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 625f6a3dfa719..856838fbb2d4a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -576,6 +576,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
+ // All GFX9 targets experience a fetch delay when an instruction at the start
+ // of a loop header is split by a 32-byte fetch window boundary, but GFX950
+ // is uniquely sensitive to this: the delay triggers further performance
+ // degradation beyond the fetch latency itself.
bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }
bool requiresCodeObjectV6() const { return RequiresCOV6; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 97da96fc7db0a..8533b1bd06d90 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -18751,7 +18751,7 @@ Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
// Respect user-specified or previously set alignment.
if (Header->getAlignment() != PrefAlign)
return Header->getAlignment();
- if (needsFetchWindowAlignment(Header))
+ if (needsFetchWindowAlignment(*Header))
return Align(32);
}
@@ -18830,17 +18830,17 @@ unsigned SITargetLowering::getMaxPermittedBytesForAlignment(
// GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
// instruction could be split by the 32-byte fetch window boundary.
// See getPrefLoopAlignment() for context.
- if (needsFetchWindowAlignment(MBB))
+ if (needsFetchWindowAlignment(*MBB))
return 4;
return TargetLowering::getMaxPermittedBytesForAlignment(MBB);
}
bool SITargetLowering::needsFetchWindowAlignment(
- const MachineBasicBlock *MBB) const {
- if (!getSubtarget()->hasLoopHeadInstSplitSensitivity() || !MBB)
+ const MachineBasicBlock &MBB) const {
+ if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
return false;
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- for (const MachineInstr &MI : *MBB) {
+ for (const MachineInstr &MI : MBB) {
if (MI.isMetaInstruction())
continue;
// Instructions larger than 4 bytes can be split by a 32-byte boundary.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index a17037915231e..968e11b104abd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -289,7 +289,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
/// Returns true if the first real instruction in MBB is 8 bytes and could
/// be split by a 32-byte fetch window boundary. Used on GFX950 to avoid
/// instruction fetch delays.
- bool needsFetchWindowAlignment(const MachineBasicBlock *MBB) const;
+ bool needsFetchWindowAlignment(const MachineBasicBlock &MBB) const;
// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
// the three offsets (voffset, soffset and instoffset) into the SDValue[3]
More information about the llvm-commits mailing list