[llvm] [AMDGPU] Mark ASYNCMARK as meta instruction to fix hazard cycle miscounting (PR #189981)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 1 08:40:03 PDT 2026
https://github.com/adeshcom14 created https://github.com/llvm/llvm-project/pull/189981
ASYNCMARK emits no hardware code; it is used for tracking purposes. However, it was not marked as a meta instruction, which caused getNumWaitStates to return 1 and GCNHazardRecognizer to incorrectly count it as a pipeline cycle.
This patch marks ASYNCMARK as a meta instruction so that it correctly reports 0 wait states.
Fixes [LCOMPILER-1681]
>From 0a53c0bb82c4831b13be7687996cc5c03cfc0352 Mon Sep 17 00:00:00 2001
From: Adesh Adikane <aadikane at amd.com>
Date: Wed, 1 Apr 2026 14:34:54 +0000
Subject: [PATCH 1/2] Mark Asyncmark as meta Instr
---
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 6 ++++--
llvm/lib/Target/AMDGPU/SOPInstructions.td | 1 +
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 4524628554fc4..ed428099718f6 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -2417,7 +2417,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
LLVM_DEBUG(dbgs() << "\n*** GenerateWaitcntInstBefore: "; MI.print(dbgs()););
setForceEmitWaitcnt();
- assert(!MI.isMetaInstruction());
+ assert(!MI.isMetaInstruction() || MI.getOpcode() == AMDGPU::ASYNCMARK);
AMDGPU::Waitcnt Wait;
const unsigned Opc = MI.getOpcode();
@@ -3263,7 +3263,9 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
E = Block.instr_end();
Iter != E; ++Iter) {
MachineInstr &Inst = *Iter;
- if (Inst.isMetaInstruction())
+ // ASYNCMARK is meta instr but needs processing by
+ // generateWaitcntInstBefore and recordAsyncMark for vmcnt tracking.
+ if (Inst.isMetaInstruction() && Inst.getOpcode() != AMDGPU::ASYNCMARK)
continue;
// Track pre-existing waitcnts that were added in earlier iterations or by
// the memory legalizer.
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 4483853fd0234..61ea62dbeef30 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1731,6 +1731,7 @@ let SubtargetPredicate = HasVMemToLDSLoad in {
def ASYNCMARK : SPseudoInstSI<(outs), (ins),
[(int_amdgcn_asyncmark)]> {
let maybeAtomic = 0;
+ let isMeta = 1;
}
def WAIT_ASYNCMARK : SOPP_Pseudo <"", (ins s16imm:$simm16), "$simm16",
[(int_amdgcn_wait_asyncmark timm:$simm16)]> {
>From 71e9c910386a8e11d59bd12b602e5c65bf434bd3 Mon Sep 17 00:00:00 2001
From: Adesh Adikane <aadikane at amd.com>
Date: Wed, 1 Apr 2026 14:36:17 +0000
Subject: [PATCH 2/2] Regenerated tests after fix
---
llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll | 6 ++++--
llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll | 6 ++++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll b/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll
index c6028497c941f..184fd61c355f0 100644
--- a/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/async-buffer-loads.ll
@@ -7,12 +7,13 @@ define float @raw.buffer.load(<4 x i32> inreg %rsrc, ptr addrspace(3) inreg %lds
; CHECK: ; %bb.0: ; %main_body
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 m0, s20
-; CHECK-NEXT: v_mov_b32_e32 v0, s20
+; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds
+; CHECK-NEXT: v_mov_b32_e32 v0, s20
; CHECK-NEXT: ; wait_asyncmark(1)
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_read_b32 v0, v0
@@ -34,12 +35,13 @@ define float @raw.ptr.buffer.load(ptr addrspace(8) inreg %rsrc, ptr addrspace(3)
; CHECK: ; %bb.0: ; %main_body
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 m0, s20
-; CHECK-NEXT: v_mov_b32_e32 v0, s20
+; CHECK-NEXT: s_nop 0
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:4 glc lds
; CHECK-NEXT: ; asyncmark
; CHECK-NEXT: buffer_load_dword off, s[16:19], 0 offset:8 slc lds
+; CHECK-NEXT: v_mov_b32_e32 v0, s20
; CHECK-NEXT: ; wait_asyncmark(1)
; CHECK-NEXT: s_waitcnt vmcnt(2)
; CHECK-NEXT: ds_read_b32 v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll b/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll
index 41292bf4aa829..366617e226629 100644
--- a/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll
@@ -359,10 +359,11 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p
; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; WITHASYNC-NEXT: v_readfirstlane_b32 s4, v2
; WITHASYNC-NEXT: s_mov_b32 m0, s4
-; WITHASYNC-NEXT: v_mov_b32_e32 v5, 0
+; WITHASYNC-NEXT: s_nop 0
; WITHASYNC-NEXT: global_load_dword v[0:1], off lds
; WITHASYNC-NEXT: ; asyncmark
; WITHASYNC-NEXT: global_load_dword v[0:1], off lds
+; WITHASYNC-NEXT: v_mov_b32_e32 v5, 0
; WITHASYNC-NEXT: s_mov_b32 s6, 2
; WITHASYNC-NEXT: s_mov_b64 s[4:5], 0
; WITHASYNC-NEXT: ; asyncmark
@@ -399,10 +400,11 @@ define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, p
; WITHOUT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; WITHOUT-NEXT: v_readfirstlane_b32 s4, v2
; WITHOUT-NEXT: s_mov_b32 m0, s4
-; WITHOUT-NEXT: v_mov_b32_e32 v5, 0
+; WITHOUT-NEXT: s_nop 0
; WITHOUT-NEXT: global_load_dword v[0:1], off lds
; WITHOUT-NEXT: ; asyncmark
; WITHOUT-NEXT: global_load_dword v[0:1], off lds
+; WITHOUT-NEXT: v_mov_b32_e32 v5, 0
; WITHOUT-NEXT: s_mov_b32 s6, 2
; WITHOUT-NEXT: s_mov_b64 s[4:5], 0
; WITHOUT-NEXT: ; asyncmark
More information about the llvm-commits
mailing list