[clang] 6f233ce - [AMDGPU] Track tensor load/store DMAs with asyncmark (#200775)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Jun 4 21:15:05 PDT 2026
Author: adeshcom14
Date: 2026-06-05T09:45:00+05:30
New Revision: 6f233ceb01347d37bd943dde84e81710ac24bfdf
URL: https://github.com/llvm/llvm-project/commit/6f233ceb01347d37bd943dde84e81710ac24bfdf
DIFF: https://github.com/llvm/llvm-project/commit/6f233ceb01347d37bd943dde84e81710ac24bfdf.diff
LOG: [AMDGPU] Track tensor load/store DMAs with asyncmark (#200775)
Wire existing variants of the tensor load-to/store-from LDS intrinsics
into the existing asyncmark/wait.asyncmark mechanism via the TENSOR_CNT
waitcnt counter.
Fixes: LCOMPILER-1619
Added:
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.td
clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
llvm/docs/AMDGPUAsyncOperations.rst
llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index b15a36df6c08f..d8020bdcc8458 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -957,8 +957,14 @@ def __builtin_amdgcn_global_store_async_from_lds_b128 : AMDGPUBuiltin<"void(_Ext
def __builtin_amdgcn_ds_atomic_async_barrier_arrive_b64 : AMDGPUBuiltin<"void(long int address_space<3> *)", [Const], "gfx1250-insts">;
def __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64 : AMDGPUBuiltin<"long int(long int address_space<3> *, long int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts"> {
+ let Documentation = [DocTensorLoadToLDS_GFX1250];
+ let ArgNames = ["D0", "D1", "D2", "D3", "D4", "cpol"];
+}
+def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts"> {
+ let Documentation = [DocTensorStoreFromLDS_GFX1250];
+ let ArgNames = ["D0", "D1", "D2", "D3", "D4", "cpol"];
+}
def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
index cb2f000fcf548..97ae239e96ad5 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
@@ -615,3 +615,35 @@ scale variant.
matrix A or B data can be reused from a previous WMMA instruction.
}];
}
+
+//===----------------------------------------------------------------------===//
+// Tensor DMA Builtins
+//===----------------------------------------------------------------------===//
+
+def DocCatTensorDMA : DocumentationCategory<"Tensor DMA Builtins"> {
+ let Content = [{
+Asynchronous tensor DMA transfers between global memory and LDS. Track
+completion using ``TENSOR_CNT`` or *asyncmarks* (see
+:ref:`amdgpu-async-operations`).
+
+``D0``..``D4`` are the five chunks of the hardware tensor descriptor. The
+``_d2`` machine instruction is selected when ``D2`` and ``D3`` are
+zero-initialized; otherwise ``_d4`` is used. ``D4`` is reserved and silently
+ignored on gfx1250. ``cpol`` is a compile-time cache-policy bitfield
+(``th`` in bits [0:2], scope in bits [3:4]).
+}];
+}
+
+def DocTensorLoadToLDS_GFX1250 : Documentation {
+ let Category = DocCatTensorDMA;
+ let Content = [{
+Asynchronously copies a tensor from global memory into LDS.
+}];
+}
+
+def DocTensorStoreFromLDS_GFX1250 : Documentation {
+ let Category = DocCatTensorDMA;
+ let Content = [{
+Asynchronously copies a tensor from LDS into global memory.
+}];
+}
diff --git a/llvm/docs/AMDGPUAsyncOperations.rst b/llvm/docs/AMDGPUAsyncOperations.rst
index a51fe4ebb7e97..0b8ea0ae77174 100644
--- a/llvm/docs/AMDGPUAsyncOperations.rst
+++ b/llvm/docs/AMDGPUAsyncOperations.rst
@@ -50,6 +50,13 @@ memory and LDS memory.
void @llvm.amdgcn.global.store.async.from.lds.type(ptr %dst, ptr %src)
void @llvm.amdgcn.cluster.load.async.to.lds.type(ptr %dst, ptr %src)
+**GFX1250 Tensor DMA Instructions**
+
+.. code-block:: llvm
+
+ void @llvm.amdgcn.tensor.load.to.lds(...)
+ void @llvm.amdgcn.tensor.store.from.lds(...)
+
Asyncmark Operations
---------------------
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
index 282eaba6586a7..df8d22fb5e3dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
@@ -35,6 +35,8 @@ StringLiteral getInstCounterName(InstCounterType T) {
return "X_CNT";
case ASYNC_CNT:
return "ASYNC_CNT";
+ case TENSOR_CNT:
+ return "TENSOR_CNT";
case VA_VDST:
return "VA_VDST";
case VM_VSRC:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
index 24dbcdf8cc475..093d8a45d207b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
@@ -30,6 +30,7 @@ enum InstCounterType {
KM_CNT, // gfx12+ only.
X_CNT, // gfx1250.
ASYNC_CNT, // gfx1250.
+ TENSOR_CNT, // gfx1250.
NUM_EXTENDED_INST_CNTS,
VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
VM_VSRC, // gfx12+ expert mode only.
@@ -77,7 +78,8 @@ class Waitcnt {
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt,
- unsigned AsyncCnt, unsigned VaVdst, unsigned VmVsrc)
+ unsigned AsyncCnt, unsigned TensorCnt, unsigned VaVdst,
+ unsigned VmVsrc)
: Waitcnt() {
Cnt[LOAD_CNT] = LoadCnt;
Cnt[DS_CNT] = DsCnt;
@@ -88,6 +90,7 @@ class Waitcnt {
Cnt[KM_CNT] = KmCnt;
Cnt[X_CNT] = XCnt;
Cnt[ASYNC_CNT] = AsyncCnt;
+ Cnt[TENSOR_CNT] = TensorCnt;
Cnt[VA_VDST] = VaVdst;
Cnt[VM_VSRC] = VmVsrc;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0ce3807395839..7241c0db726ce 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -161,7 +161,8 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) {
DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ \
- DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */
+ DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */ \
+ DECL(TENSOR_ACCESS) /* access that uses TENSOR_CNT */
// clang-format off
#define AMDGPU_EVENT_ENUM(Name) Name,
@@ -221,7 +222,7 @@ static const unsigned
AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
- AMDGPU::S_WAIT_ASYNCCNT};
+ AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
// ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware
// code but still need to be processed by this pass for async vmcnt tracking.
@@ -425,8 +426,9 @@ class WaitcntGenerator {
// Returns a new waitcnt with all counters except VScnt set to 0. If
// IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
- // AsyncCnt always defaults to ~0u (don't wait for it). It is only updated
- // when a call to @llvm.amdgcn.wait.asyncmark() is processed.
+ // AsyncCnt and TensorCnt always default to ~0u (don't wait for them). They
+ // are only updated when a call to @llvm.amdgcn.wait.asyncmark() is
+ // processed.
virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
virtual ~WaitcntGenerator() = default;
@@ -446,6 +448,8 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
WaitEventSet(),
WaitEventSet(),
WaitEventSet(),
+ WaitEventSet(),
+ WaitEventSet(),
WaitEventSet()};
public:
@@ -482,6 +486,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
WaitEventSet({ASYNC_ACCESS}),
+ WaitEventSet({TENSOR_ACCESS}),
WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
VGPR_XDL_WRITE}),
WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
@@ -675,6 +680,8 @@ class SIInsertWaitcnts {
bool shouldUpdateAsyncMark(const MachineInstr &MI,
AMDGPU::InstCounterType T) const {
+ if (SIInstrInfo::usesTENSOR_CNT(MI))
+ return T == AMDGPU::TENSOR_CNT;
if (!isAsyncLdsDmaWrite(MI))
return false;
if (SIInstrInfo::usesASYNC_CNT(MI))
@@ -1835,6 +1842,8 @@ counterTypeForInstr(unsigned Opcode) {
return AMDGPU::X_CNT;
case AMDGPU::S_WAIT_ASYNCCNT:
return AMDGPU::ASYNC_CNT;
+ case AMDGPU::S_WAIT_TENSORCNT:
+ return AMDGPU::TENSOR_CNT;
default:
return {};
}
@@ -2090,8 +2099,8 @@ AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
- ~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal,
- ExpertVal);
+ ~0u /* XCNT */, ~0u /* ASYNC_CNT */,
+ ~0u /* TENSOR_CNT */, ExpertVal, ExpertVal);
}
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
@@ -3057,9 +3066,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
if (SIInstrInfo::usesASYNC_CNT(Inst)) {
ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
}
+ } else if (SIInstrInfo::usesTENSOR_CNT(Inst)) {
+ ScoreBrackets->updateByEvent(TENSOR_ACCESS, Inst);
} else if (Inst.isCall()) {
- // Act as a wait on everything, but AsyncCnt is never included in such
- // blanket waits.
+ // Act as a wait on everything, but AsyncCnt and TensorCnt are never
+ // included in such blanket waits.
ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
ScoreBrackets->setStateOnFunctionEntryOrReturn();
} else if (TII.isVINTERP(Inst)) {
@@ -3813,7 +3824,7 @@ bool SIInsertWaitcnts::run() {
for (auto CT : inst_counter_types(AMDGPU::NUM_EXTENDED_INST_CNTS)) {
if (CT == AMDGPU::LOAD_CNT || CT == AMDGPU::DS_CNT ||
CT == AMDGPU::STORE_CNT || CT == AMDGPU::X_CNT ||
- CT == AMDGPU::ASYNC_CNT)
+ CT == AMDGPU::ASYNC_CNT || CT == AMDGPU::TENSOR_CNT)
continue;
if (!ST.hasImageInsts() &&
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 20ab23df208f8..831aa9ebb8435 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1078,6 +1078,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return get(Opcode).TSFlags & SIInstrFlags::ASYNC_CNT;
}
+ static bool usesTENSOR_CNT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::TENSOR_CNT;
+ }
+
+ bool usesTENSOR_CNT(uint32_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::TENSOR_CNT;
+ }
+
// Most sopk treat the immediate as a signed 16-bit, however some
// use it as unsigned.
static bool sopkIsZext(unsigned Opcode) {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll
index a8faa4620befa..9f30255a07095 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll
@@ -5,6 +5,9 @@
; %D4 should be zero-initialized for gfx1250, which only supports 4 groups of tensor descriptor
declare void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 %cpol)
declare void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 %cpol)
+declare void @llvm.amdgcn.asyncmark()
+declare void @llvm.amdgcn.wait.asyncmark(i16)
+declare void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %src, ptr addrspace(3) %dst, i32 %offset, i32 %cpol)
define amdgpu_ps void @tensor_load_to_lds_d4(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) {
; GFX1250-LABEL: tensor_load_to_lds_d4:
@@ -271,3 +274,248 @@ define amdgpu_ps void @tensor_store_from_lds_d5(<4 x i32> inreg %D0, <8 x i32> i
call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3, <8 x i32> %D4, i32 22)
ret void
}
+
+;=======================================================================
+; Tensor load/store DMAs are asynchronous and tracked via TENSOR_CNT.
+; Verify that they participate in the asyncmark / wait_asyncmark
+; mechanism.
+;========================================================================
+
+define amdgpu_ps void @tensor_load_to_lds_with_asyncmark(<4 x i32> inreg %D0, <8 x i32> inreg %D1) {
+; GFX1250-LABEL: tensor_load_to_lds_with_asyncmark:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11]
+; GFX1250-NEXT: ; asyncmark
+; GFX1250-NEXT: ; wait_asyncmark(0)
+; GFX1250-NEXT: s_wait_tensorcnt 0x0
+; GFX1250-NEXT: s_endpgm
+ call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.wait.asyncmark(i16 0)
+ ret void
+}
+
+define amdgpu_ps void @tensor_store_from_lds_with_asyncmark(<4 x i32> inreg %D0, <8 x i32> inreg %D1) {
+; GFX1250-LABEL: tensor_store_from_lds_with_asyncmark:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: tensor_store_from_lds s[0:3], s[4:11]
+; GFX1250-NEXT: ; asyncmark
+; GFX1250-NEXT: ; wait_asyncmark(0)
+; GFX1250-NEXT: s_wait_tensorcnt 0x0
+; GFX1250-NEXT: s_endpgm
+ call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.wait.asyncmark(i16 0)
+ ret void
+}
+
+; Two outstanding tensor loads tracked by separate asyncmarks. The first
+; wait_asyncmark(1) drains down to one outstanding TENSOR_CNT entry, the
+; second wait_asyncmark(0) drains the remaining one.
+define amdgpu_ps void @tensor_load_to_lds_two_asyncmarks(<4 x i32> inreg %D0a, <8 x i32> inreg %D1a, <4 x i32> inreg %D0b, <8 x i32> inreg %D1b, ptr addrspace(3) %lds) {
+; GFX1250-LABEL: tensor_load_to_lds_two_asyncmarks:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[4:11]
+; GFX1250-NEXT: ; asyncmark
+; GFX1250-NEXT: tensor_load_to_lds s[12:15], s[16:23]
+; GFX1250-NEXT: ; asyncmark
+; GFX1250-NEXT: ; wait_asyncmark(1)
+; GFX1250-NEXT: s_wait_tensorcnt 0x1
+; GFX1250-NEXT: ds_load_b32 v1, v0
+; GFX1250-NEXT: ; wait_asyncmark(0)
+; GFX1250-NEXT: s_wait_tensorcnt 0x0
+; GFX1250-NEXT: ds_load_b32 v2, v0 offset:4
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1250-NEXT: ds_store_b32 v0, v1
+; GFX1250-NEXT: s_endpgm
+ call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0a, <8 x i32> %D1a, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0b, <8 x i32> %D1b, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+
+ call void @llvm.amdgcn.wait.asyncmark(i16 1)
+ %lds_v0 = load i32, ptr addrspace(3) %lds
+
+ call void @llvm.amdgcn.wait.asyncmark(i16 0)
+ %lds_gep1 = getelementptr i32, ptr addrspace(3) %lds, i32 1
+ %lds_v1 = load i32, ptr addrspace(3) %lds_gep1
+
+ %sum = add i32 %lds_v0, %lds_v1
+ store i32 %sum, ptr addrspace(3) %lds
+
+ ret void
+}
+
+;=======================================================================
+; Mix ASYNC_CNT and TENSOR_CNT tracked operations under a single
+; asyncmark, and verify that a wait.asyncmark drains both counters.
+;========================================================================
+
+define void @tensor_and_async_lds_with_asyncmark(<4 x i32> inreg %D0, <8 x i32> inreg %D1, ptr addrspace(1) %src, ptr addrspace(3) %dst) {
+; GFX1250-LABEL: tensor_and_async_lds_with_asyncmark:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off
+; GFX1250-NEXT: tensor_load_to_lds s[0:3], s[16:23]
+; GFX1250-NEXT: ; asyncmark
+; GFX1250-NEXT: ; wait_asyncmark(0)
+; GFX1250-NEXT: s_wait_asynccnt 0x0
+; GFX1250-NEXT: s_wait_tensorcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+ call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %src, ptr addrspace(3) %dst, i32 0, i32 0)
+ call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ call void @llvm.amdgcn.wait.asyncmark(i16 0)
+ ret void
+}
+
+;=======================================================================
+; Two diamond if/else regions, each picking either a tensor DMA or an
+; async-LDS DMA, each followed by its own asyncmark.
+;========================================================================
+
+define void @tensor_or_async_lds_diamonds(i32 inreg %cond1, i32 inreg %cond2, <4 x i32> inreg %D0, <8 x i32> inreg %D1, ptr addrspace(1) %src, ptr addrspace(3) %dst) {
+; GFX1250-SDAG-LABEL: tensor_or_async_lds_diamonds:
+; GFX1250-SDAG: ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_mov_b32 s11, s25
+; GFX1250-SDAG-NEXT: s_mov_b32 s10, s24
+; GFX1250-SDAG-NEXT: s_mov_b32 s9, s23
+; GFX1250-SDAG-NEXT: s_mov_b32 s8, s22
+; GFX1250-SDAG-NEXT: s_mov_b32 s7, s21
+; GFX1250-SDAG-NEXT: s_mov_b32 s6, s20
+; GFX1250-SDAG-NEXT: s_mov_b32 s5, s19
+; GFX1250-SDAG-NEXT: s_mov_b32 s4, s18
+; GFX1250-SDAG-NEXT: s_mov_b32 s15, s17
+; GFX1250-SDAG-NEXT: s_mov_b32 s14, s16
+; GFX1250-SDAG-NEXT: s_mov_b32 s13, s3
+; GFX1250-SDAG-NEXT: s_mov_b32 s12, s2
+; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, -1
+; GFX1250-SDAG-NEXT: s_cbranch_scc1 .LBB14_6
+; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow1
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB14_3
+; GFX1250-SDAG-NEXT: .LBB14_2: ; %t1
+; GFX1250-SDAG-NEXT: tensor_load_to_lds s[12:15], s[4:11]
+; GFX1250-SDAG-NEXT: ; asyncmark
+; GFX1250-SDAG-NEXT: .LBB14_3: ; %merge1
+; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, 0
+; GFX1250-SDAG-NEXT: s_mov_b32 s0, -1
+; GFX1250-SDAG-NEXT: s_cbranch_scc1 .LBB14_7
+; GFX1250-SDAG-NEXT: ; %bb.4: ; %Flow
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
+; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB14_8
+; GFX1250-SDAG-NEXT: .LBB14_5: ; %merge2
+; GFX1250-SDAG-NEXT: ; wait_asyncmark(1)
+; GFX1250-SDAG-NEXT: s_wait_asynccnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_tensorcnt 0x0
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-SDAG-NEXT: .LBB14_6: ; %g1
+; GFX1250-SDAG-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off
+; GFX1250-SDAG-NEXT: ; asyncmark
+; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB14_2
+; GFX1250-SDAG-NEXT: s_branch .LBB14_3
+; GFX1250-SDAG-NEXT: .LBB14_7: ; %g2
+; GFX1250-SDAG-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off
+; GFX1250-SDAG-NEXT: ; asyncmark
+; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB14_5
+; GFX1250-SDAG-NEXT: .LBB14_8: ; %t2
+; GFX1250-SDAG-NEXT: tensor_load_to_lds s[12:15], s[4:11]
+; GFX1250-SDAG-NEXT: ; asyncmark
+; GFX1250-SDAG-NEXT: ; wait_asyncmark(1)
+; GFX1250-SDAG-NEXT: s_wait_asynccnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_tensorcnt 0x1
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: tensor_or_async_lds_diamonds:
+; GFX1250-GISEL: ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_mov_b32 s12, s2
+; GFX1250-GISEL-NEXT: s_mov_b32 s13, s3
+; GFX1250-GISEL-NEXT: s_mov_b32 s14, s16
+; GFX1250-GISEL-NEXT: s_mov_b32 s15, s17
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, s18
+; GFX1250-GISEL-NEXT: s_mov_b32 s5, s19
+; GFX1250-GISEL-NEXT: s_mov_b32 s6, s20
+; GFX1250-GISEL-NEXT: s_mov_b32 s7, s21
+; GFX1250-GISEL-NEXT: s_mov_b32 s8, s22
+; GFX1250-GISEL-NEXT: s_mov_b32 s9, s23
+; GFX1250-GISEL-NEXT: s_mov_b32 s10, s24
+; GFX1250-GISEL-NEXT: s_mov_b32 s11, s25
+; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s0, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, 1
+; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB14_2
+; GFX1250-GISEL-NEXT: ; %bb.1: ; %g1
+; GFX1250-GISEL-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1250-GISEL-NEXT: ; asyncmark
+; GFX1250-GISEL-NEXT: .LBB14_2: ; %Flow1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_xor_b32 s0, s0, 1
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB14_4
+; GFX1250-GISEL-NEXT: ; %bb.3: ; %t1
+; GFX1250-GISEL-NEXT: tensor_load_to_lds s[12:15], s[4:11]
+; GFX1250-GISEL-NEXT: ; asyncmark
+; GFX1250-GISEL-NEXT: .LBB14_4: ; %merge1
+; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s1, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, 1
+; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB14_6
+; GFX1250-GISEL-NEXT: ; %bb.5: ; %g2
+; GFX1250-GISEL-NEXT: global_load_async_to_lds_b32 v2, v[0:1], off
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX1250-GISEL-NEXT: ; asyncmark
+; GFX1250-GISEL-NEXT: .LBB14_6: ; %Flow
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_xor_b32 s0, s0, 1
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB14_8
+; GFX1250-GISEL-NEXT: ; %bb.7: ; %t2
+; GFX1250-GISEL-NEXT: tensor_load_to_lds s[12:15], s[4:11]
+; GFX1250-GISEL-NEXT: ; asyncmark
+; GFX1250-GISEL-NEXT: .LBB14_8: ; %merge2
+; GFX1250-GISEL-NEXT: ; wait_asyncmark(1)
+; GFX1250-GISEL-NEXT: s_wait_asynccnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_tensorcnt 0x0
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+entry:
+ %c1 = icmp ne i32 %cond1, 0
+ br i1 %c1, label %t1, label %g1
+
+t1:
+ call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ br label %merge1
+
+g1:
+ call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %src, ptr addrspace(3) %dst, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ br label %merge1
+
+merge1:
+ %c2 = icmp ne i32 %cond2, 0
+ br i1 %c2, label %t2, label %g2
+
+t2:
+ call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ br label %merge2
+
+g2:
+ call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %src, ptr addrspace(3) %dst, i32 0, i32 0)
+ call void @llvm.amdgcn.asyncmark()
+ br label %merge2
+
+merge2:
+ call void @llvm.amdgcn.wait.asyncmark(i16 1)
+ ret void
+}
More information about the cfe-commits
mailing list