[clang] [llvm] [AMDGPU] Add async variants of tensor load/store LDS intrinsics. (PR #200775)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Jun 1 03:42:34 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-ir
Author: adeshcom14
<details>
<summary>Changes</summary>
Adds async variants of the tensor load-to/store-from LDS intrinsics and wires them into the existing asyncmark/wait.asyncmark mechanism via the TENSOR_CNT waitcnt counter.
Fixes: LCOMPILER-1619
---
Patch is 41.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/200775.diff
15 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.td (+2)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl (+66)
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl (+2)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+27-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+26-3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp (+2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h (+4-1)
- (modified) llvm/lib/Target/AMDGPU/MIMGInstructions.td (+3-2)
- (modified) llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (+24-9)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+8)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.async.tensor.load.store.ll (+154)
- (modified) llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/sched-ldsdma-mask.mir (+24-24)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index b15a36df6c08f..12c62ec3d0f04 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -959,6 +959,8 @@ def __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64 : AMDGPUBuiltin<"long int(
def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_tensor_load_async_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_tensor_store_async_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, unsigned int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
index 71b06e890ab9b..bf37f53a15734 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
@@ -76,3 +76,69 @@ void test_amdgcn_tensor_store_from_lds_d5(v4u sg0, v8i sg1, v4i sg2, v4i sg3, v8
{
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, 0);
}
+
+//=======================================================================
+// Async tensor load/store builtins. They lower to the same machine
+// instructions as the sync variants but are tracked by the asyncmark
+// mechanism.
+//========================================================================
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_async_to_lds_d4(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_load_async_to_lds_d4(v4u sg0, v8i sg1, v4i sg2, v4i sg3)
+{
+ __builtin_amdgcn_tensor_load_async_to_lds(sg0, sg1, sg2, sg3, v8i_zeros, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_async_to_lds_d2(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 27)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_load_async_to_lds_d2(v4u sg0, v8i sg1)
+{
+ __builtin_amdgcn_tensor_load_async_to_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 27);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_async_from_lds_d4(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.async.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 22)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_store_async_from_lds_d4(v4u sg0, v8i sg1, v4i sg2, v4i sg3)
+{
+ __builtin_amdgcn_tensor_store_async_from_lds(sg0, sg1, sg2, sg3, v8i_zeros, 22);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_async_from_lds_d2(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.async.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_store_async_from_lds_d2(v4u sg0, v8i sg1)
+{
+ __builtin_amdgcn_tensor_store_async_from_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_async_to_lds_d5(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.async.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_load_async_to_lds_d5(v4u sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4)
+{
+ __builtin_amdgcn_tensor_load_async_to_lds(sg0, sg1, sg2, sg3, sg4, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_async_from_lds_d5(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.async.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_store_async_from_lds_d5(v4u sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4)
+{
+ __builtin_amdgcn_tensor_store_async_from_lds(sg0, sg1, sg2, sg3, sg4, 0);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index c0d5a946f1e9f..1d867384bd514 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -189,6 +189,8 @@ void test_amdgcn_tensor_load_store(v4u sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4,
{
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
+ __builtin_amdgcn_tensor_load_async_to_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_async_to_lds' must be a constant integer}}
+ __builtin_amdgcn_tensor_store_async_from_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_async_from_lds' must be a constant integer}}
}
void test_prefetch(generic void *fptr, global void *gptr, int cpol) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 2fd5285dfc330..85dffb66a0029 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -4222,6 +4222,8 @@ class AMDGPUTensorLoadStore:
def int_amdgcn_tensor_load_to_lds : ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore;
def int_amdgcn_tensor_store_from_lds : ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_load_async_to_lds : ClangBuiltin<"__builtin_amdgcn_tensor_load_async_to_lds">, AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_store_async_from_lds : ClangBuiltin<"__builtin_amdgcn_tensor_store_async_from_lds">, AMDGPUTensorLoadStore;
class AMDGPUClusterLoad<LLVMType ptr_ty>:
Intrinsic<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index ecf8d957fc80f..66001106c5995 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3040,11 +3040,32 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
}
void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
- bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
+ bool IsLoad, IsAsync;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_tensor_load_to_lds:
+ IsLoad = true;
+ IsAsync = false;
+ break;
+ case Intrinsic::amdgcn_tensor_store_from_lds:
+ IsLoad = false;
+ IsAsync = false;
+ break;
+ case Intrinsic::amdgcn_tensor_load_async_to_lds:
+ IsLoad = true;
+ IsAsync = true;
+ break;
+ case Intrinsic::amdgcn_tensor_store_async_from_lds:
+ IsLoad = false;
+ IsAsync = true;
+ break;
+ default:
+ llvm_unreachable("not a tensor load/store intrinsic");
+ }
+
unsigned Opc =
IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
- SmallVector<SDValue, 7> TensorOps;
+ SmallVector<SDValue, 8> TensorOps;
// First two groups
TensorOps.push_back(N->getOperand(2)); // D# group 0
TensorOps.push_back(N->getOperand(3)); // D# group 1
@@ -3065,6 +3086,8 @@ void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
// for now because all existing targets only support up to 4 groups.
TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
TensorOps.push_back(N->getOperand(7)); // cache policy
+ TensorOps.push_back(
+ CurDAG->getTargetConstant(IsAsync, SDLoc(N), MVT::i1)); // IsAsync
TensorOps.push_back(N->getOperand(0)); // chain
(void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
@@ -3354,6 +3377,8 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
return;
case Intrinsic::amdgcn_tensor_load_to_lds:
case Intrinsic::amdgcn_tensor_store_from_lds:
+ case Intrinsic::amdgcn_tensor_load_async_to_lds:
+ case Intrinsic::amdgcn_tensor_store_async_from_lds:
SelectTensorLoadStore(N, IntrID);
return;
default:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 463b8c40350b2..47407721ca325 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2470,6 +2470,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectGlobalLoadLds(I);
case Intrinsic::amdgcn_tensor_load_to_lds:
case Intrinsic::amdgcn_tensor_store_from_lds:
+ case Intrinsic::amdgcn_tensor_load_async_to_lds:
+ case Intrinsic::amdgcn_tensor_store_async_from_lds:
return selectTensorLoadStore(I, IntrinsicID);
case Intrinsic::amdgcn_asyncmark:
case Intrinsic::amdgcn_wait_asyncmark:
@@ -3872,7 +3874,27 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
Intrinsic::ID IID) const {
- bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
+ bool IsLoad, IsAsync;
+ switch (IID) {
+ case Intrinsic::amdgcn_tensor_load_to_lds:
+ IsLoad = true;
+ IsAsync = false;
+ break;
+ case Intrinsic::amdgcn_tensor_store_from_lds:
+ IsLoad = false;
+ IsAsync = false;
+ break;
+ case Intrinsic::amdgcn_tensor_load_async_to_lds:
+ IsLoad = true;
+ IsAsync = true;
+ break;
+ case Intrinsic::amdgcn_tensor_store_async_from_lds:
+ IsLoad = false;
+ IsAsync = true;
+ break;
+ default:
+ llvm_unreachable("not a tensor load/store intrinsic");
+ }
unsigned Opc =
IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_d4 : AMDGPU::TENSOR_STORE_FROM_LDS_d4;
int NumGroups = 4;
@@ -3904,8 +3926,9 @@ bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
.add(MI.getOperand(4)); // D# group 3
}
- MIB.addImm(0) // r128
- .add(MI.getOperand(6)); // cpol
+ MIB.addImm(0) // r128
+ .add(MI.getOperand(6)) // cpol
+ .addImm(IsAsync ? 1 : 0); // IsAsync
MI.eraseFromParent();
return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index a24df782cf28a..62c7991dec96f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3382,7 +3382,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case Intrinsic::amdgcn_tensor_load_to_lds:
- case Intrinsic::amdgcn_tensor_store_from_lds: {
+ case Intrinsic::amdgcn_tensor_store_from_lds:
+ case Intrinsic::amdgcn_tensor_load_async_to_lds:
+ case Intrinsic::amdgcn_tensor_store_async_from_lds: {
constrainOpWithReadfirstlane(B, MI, 1);
constrainOpWithReadfirstlane(B, MI, 2);
constrainOpWithReadfirstlane(B, MI, 3);
@@ -5647,7 +5649,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_pops_exiting_wave_id:
return getDefaultMappingSOP(MI);
case Intrinsic::amdgcn_tensor_load_to_lds:
- case Intrinsic::amdgcn_tensor_store_from_lds: {
+ case Intrinsic::amdgcn_tensor_store_from_lds:
+ case Intrinsic::amdgcn_tensor_load_async_to_lds:
+ case Intrinsic::amdgcn_tensor_store_async_from_lds: {
// Lie and claim everything is legal, even all operands need to be
// SGPRs. applyMapping will have to deal with it with readfirstlane.
for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
index 282eaba6586a7..df8d22fb5e3dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.cpp
@@ -35,6 +35,8 @@ StringLiteral getInstCounterName(InstCounterType T) {
return "X_CNT";
case ASYNC_CNT:
return "ASYNC_CNT";
+ case TENSOR_CNT:
+ return "TENSOR_CNT";
case VA_VDST:
return "VA_VDST";
case VM_VSRC:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
index 24dbcdf8cc475..093d8a45d207b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitcntUtils.h
@@ -30,6 +30,7 @@ enum InstCounterType {
KM_CNT, // gfx12+ only.
X_CNT, // gfx1250.
ASYNC_CNT, // gfx1250.
+ TENSOR_CNT, // gfx1250.
NUM_EXTENDED_INST_CNTS,
VA_VDST = NUM_EXTENDED_INST_CNTS, // gfx12+ expert mode only.
VM_VSRC, // gfx12+ expert mode only.
@@ -77,7 +78,8 @@ class Waitcnt {
// gfx12+ constructor.
Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt, unsigned XCnt,
- unsigned AsyncCnt, unsigned VaVdst, unsigned VmVsrc)
+ unsigned AsyncCnt, unsigned TensorCnt, unsigned VaVdst,
+ unsigned VmVsrc)
: Waitcnt() {
Cnt[LOAD_CNT] = LoadCnt;
Cnt[DS_CNT] = DsCnt;
@@ -88,6 +90,7 @@ class Waitcnt {
Cnt[KM_CNT] = KmCnt;
Cnt[X_CNT] = XCnt;
Cnt[ASYNC_CNT] = AsyncCnt;
+ Cnt[TENSOR_CNT] = TensorCnt;
Cnt[VA_VDST] = VaVdst;
Cnt[VM_VSRC] = VmVsrc;
}
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 0f31697f15688..34efcd0cf4cde 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -2182,9 +2182,10 @@ class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
let hasSideEffects = 0;
bit UpTo2D = _UpTo2D;
- let InOperandList = !if(UpTo2D, (ins SReg_128_XNULL:$vaddr0, SReg_256_XNULL:$vaddr1, R128A16:$r128, CPol:$cpol),
+ let InOperandList = !if(UpTo2D, (ins SReg_128_XNULL:$vaddr0, SReg_256_XNULL:$vaddr1,
+ R128A16:$r128, CPol:$cpol, i1imm:$IsAsync),
(ins SReg_128_XNULL:$vaddr0, SReg_256_XNULL:$vaddr1, SReg_128_XNULL:$vaddr2,
- SReg_128_XNULL:$vaddr3, R128A16:$r128, CPol:$cpol));
+ SReg_128_XNULL:$vaddr3, R128A16:$r128, CPol:$cpol, i1imm:$IsAsync));
string AsmOperands = " $vaddr0, $vaddr1"#!if(UpTo2D, "", ", $vaddr2, $vaddr3")#"$r128$cpol";
}
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 0ce3807395839..db2f55552cb96 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -161,7 +161,8 @@ static constexpr VMEMID toVMEMID(MCRegUnit RU) {
DECL(VGPR_LDS_READ) /* read VGPR source in LDS */ \
DECL(VGPR_FLAT_READ) /* read VGPR source in FLAT */ \
DECL(VGPR_VMEM_READ) /* read VGPR source in other VMEM */ \
- DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */
+ DECL(ASYNC_ACCESS) /* access that uses ASYNC_CNT */ \
+ DECL(TENSOR_ACCESS) /* access that uses TENSOR_CNT */
// clang-format off
#define AMDGPU_EVENT_ENUM(Name) Name,
@@ -221,7 +222,7 @@ static const unsigned
AMDGPU::S_WAIT_EXPCNT, AMDGPU::S_WAIT_STORECNT,
AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT,
- AMDGPU::S_WAIT_ASYNCCNT};
+ AMDGPU::S_WAIT_ASYNCCNT, AMDGPU::S_WAIT_TENSORCNT};
// ASYNCMARK and WAIT_ASYNCMARK are meta instructions that emit no hardware
// code but still need to be processed by this pass for async vmcnt tracking.
@@ -425,8 +426,9 @@ class WaitcntGenerator {
// Returns a new waitcnt with all counters except VScnt set to 0. If
// IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
- // AsyncCnt always defaults to ~0u (don't wait for it). It is only updated
- // when a call to @llvm.amdgcn.wait.asyncmark() is processed.
+ // AsyncCnt and TensorCnt always default to ~0u (don't wait for them). They
+ // are only updated when a call to @llvm.amdgcn.wait.asyncmark() is
+ // processed.
virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
virtual ~WaitcntGenerator() = default;
@@ -446,6 +448,8 @@ class WaitcntGeneratorPreGFX12 final : public WaitcntGenerator {
WaitEventSet(),
WaitEventSet(),
WaitEventSet(),
+ WaitEventSet(),
+ WaitEventSet(),
WaitEventSet()};
public:
@@ -482,6 +486,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
WaitEventSet({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
WaitEventSet({VMEM_GROUP, SMEM_GROUP}),
WaitEventSet({ASYNC_ACCESS}),
+ WaitEventSet({TENSOR_ACCESS}),
WaitEventSet({VGPR_CSMACC_WRITE, VGPR_DPMACC_WRITE, VGPR_TRANS_WRITE,
VGPR_XDL_WRITE}),
WaitEventSet({VGPR_LDS_READ, VGPR_FLAT_READ, VGPR_VMEM_READ})};
@@ -673,8 +678,14 @@ class SIInsertWaitcnts {
return SIInstrInfo::mayWriteLDSThroughDMA(MI) && isAsync(MI);
}
+ bool isAsyncTensorDMA(const MachineInstr &MI) const {
+ return SIInstrInfo::usesTENSOR_CNT(MI) && isAsync(MI);
+ }
+
bool shouldUpdateAsyncMark(const MachineInstr &MI,
AMDGPU::InstCounterType T) const {
+ if (isAsyncTensorDMA(MI))
+ return T == AMDGPU::TENSOR_CNT;
if (!isAsyncLdsDmaWrite(MI))
return false;
if (SIInstrInfo::usesASYNC_CNT(MI))
@@ -1835,6 +1846,8 @@ counterTypeForInstr(unsigned Opcode) {
return AMDGPU::X_CNT;
case AMDGPU::S_WAIT_ASYNCCNT:
return AMDGPU::ASYNC_CNT;
+ case AMDGPU::S_WAIT_TENSORCNT:
+ return AMDGPU::TENSOR_CNT;
default:
return {};
}
@@ -2090,8 +2103,8 @@ AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
unsigned ExpertVal = IsExpertMode ? 0 : ~0u;
return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
- ~0u /* XCNT */, ~0u /* ASYNC_CNT */, ExpertVal,
- ExpertVal);
+ ~0u /* XCNT */, ~0u /* ASYNC_CNT */,
+ ~0u /* TENSOR_CNT */, ExpertVal, ExpertVal);
}
/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
@@ -3057,9 +3070,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
if (SIInstrInfo::usesASYNC_CNT(Inst)) {
ScoreBrackets->updateByEvent(ASYNC_ACCESS, Inst);
}
+ } else if (isAsyncTensorDMA(Inst)) {
+ ScoreBrackets->updateByEvent(TENSOR_ACCESS, Inst);
} else if (Inst.isCall()) {
- // Act as a wait on everything, but AsyncCnt is never included in such
- // blanket waits.
+ ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/200775
More information about the cfe-commits
mailing list