[clang] [llvm] AMDGPU: Implement tensor load and store instructions for gfx1250 (PR #146636)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 1 22:46:31 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Changpeng Fang (changpeng)
---
Patch is 42.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146636.diff
20 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+5)
- (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+26)
- (added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl (+46)
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl (+11)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+31)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+30)
- (modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+23-6)
- (modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+3-7)
- (modified) llvm/lib/Target/AMDGPU/MIMGInstructions.td (+94)
- (modified) llvm/lib/Target/AMDGPU/SIDefines.h (+1-2)
- (modified) llvm/lib/Target/AMDGPU/SIInstrFormats.td (+4-2)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+13)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+1)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+1)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+24)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+7)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll (+245)
- (added) llvm/test/MC/AMDGPU/gfx1250_asm_vimage.s (+34)
- (added) llvm/test/MC/AMDGPU/gfx1250_asm_vimage_err.s (+25)
- (added) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vimage.txt (+25)
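For orientation, here is a minimal sketch of how the four new builtins are intended to be called. It simply mirrors the signatures added to BuiltinsAMDGPU.def and the CodeGen test in the diff below; the function name and descriptor contents are illustrative, and compiling it assumes `-target-cpu gfx1250`. The full variants take D# groups 0-3 (v4i32, v8i32, v4i32, v4i32), the `_d2` variants take only groups 0 and 1, and the trailing cache-policy argument must be a constant integer.

```c
// Illustrative usage sketch only; the signatures follow BuiltinsAMDGPU.def below.
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));

void tensor_dma_sketch(v4i sg0, v8i sg1, v4i sg2, v4i sg3) {
  // Full form: D# groups 0-3 plus a constant cache-policy immediate.
  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);
  __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, 0);

  // _d2 form: only D# groups 0 and 1 (the encoding ties groups 2 and 3 to
  // SGPR_NULL for the 2D-or-less case, per MIMGInstructions.td below).
  __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 0);
  __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
}
```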
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 5133947c498ca..fb358297a5eed 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -640,6 +640,11 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
// GFX1250+ only builtins.
//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index f09b3b92c4ea0..1fc2d57d4941c 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -621,6 +621,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
return Builder.CreateCall(F, {Addr});
}
+ case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds:
+ case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds_d2:
+ case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds:
+ case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds_d2: {
+ Intrinsic::ID IID;
+ switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds:
+ IID = Intrinsic::amdgcn_tensor_load_to_lds;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds_d2:
+ IID = Intrinsic::amdgcn_tensor_load_to_lds_d2;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds:
+ IID = Intrinsic::amdgcn_tensor_store_from_lds;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds_d2:
+ IID = Intrinsic::amdgcn_tensor_store_from_lds_d2;
+ break;
+ }
+
+ SmallVector<Value *, 5> Args;
+ for (int i = 0, e = E->getNumArgs(); i != e; ++i)
+ Args.push_back(EmitScalarExpr(E->getArg(i)));
+ llvm::Function *F = CGM.getIntrinsic(IID, {});
+ return Builder.CreateCall(F, {Args});
+ }
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
// Should this have asan instrumentation?
return emitBuiltinWithOneOverloadedType<5>(*this, E,
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
new file mode 100644
index 0000000000000..49ffbf4517160
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
@@ -0,0 +1,46 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
+
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
+{
+ __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d2(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 27)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1)
+{
+ __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 27);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 22)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
+{
+ __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, 22);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d2(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_store_from_lds_d2(v4i sg0, v8i sg1)
+{
+ __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 69857087bae08..3ba0d50e79031 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -1,6 +1,9 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
void test_setprio_inc_wg(short a) {
__builtin_amdgcn_s_setprio_inc_wg(a); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' must be a constant integer}}
}
@@ -16,3 +19,11 @@ void test__builtin_amdgcn_cvt_f16_fp8(int a, int b) {
void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) {
__builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}}
}
+
+void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
+{
+ __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
+ __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds_d2' must be a constant integer}}
+ __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
+ __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a0a81568424f5..2aabf6109022f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3580,6 +3580,37 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
[IntrNoMem, IntrSpeculatable]
>;
+class AMDGPUTensorLoadStore:
+ Intrinsic<
+ [],
+ [llvm_v4i32_ty, // D# group 0
+ llvm_v8i32_ty, // D# group 1
+ llvm_v4i32_ty, // D# group 2
+ llvm_v4i32_ty, // D# group 3
+ llvm_i32_ty], // cachepolicy:
+ // bits [0-2] = th
+ // bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]
+ >;
+
+class AMDGPUTensorLoadStoreD2:
+ Intrinsic<
+ [],
+ [llvm_v4i32_ty, // D# group 0
+ llvm_v8i32_ty, // D# group 1
+ llvm_i32_ty], // cachepolicy:
+ // bits [0-2] = th
+ // bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]
+ >;
+
+def int_amdgcn_tensor_load_to_lds : AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_store_from_lds : AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_load_to_lds_d2 : AMDGPUTensorLoadStoreD2;
+def int_amdgcn_tensor_store_from_lds_d2 : AMDGPUTensorLoadStoreD2;
+
/// Emit an addrspacecast without null pointer checking.
/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 6f6d7b8d99af5..353fb23fa1520 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3348,6 +3348,20 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MI.eraseFromParent();
return;
}
+ case Intrinsic::amdgcn_tensor_load_to_lds:
+ case Intrinsic::amdgcn_tensor_store_from_lds: {
+ constrainOpWithReadfirstlane(B, MI, 1);
+ constrainOpWithReadfirstlane(B, MI, 2);
+ constrainOpWithReadfirstlane(B, MI, 3);
+ constrainOpWithReadfirstlane(B, MI, 4);
+ return;
+ }
+ case Intrinsic::amdgcn_tensor_load_to_lds_d2:
+ case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
+ constrainOpWithReadfirstlane(B, MI, 1);
+ constrainOpWithReadfirstlane(B, MI, 2);
+ return;
+ }
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -5354,6 +5368,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_pops_exiting_wave_id:
return getDefaultMappingSOP(MI);
+ case Intrinsic::amdgcn_tensor_load_to_lds_d2:
+ case Intrinsic::amdgcn_tensor_store_from_lds_d2:
+ case Intrinsic::amdgcn_tensor_load_to_lds:
+ case Intrinsic::amdgcn_tensor_store_from_lds: {
+      // Lie and claim everything is legal, even though all operands need to be
+      // SGPRs. applyMapping will have to deal with it with readfirstlane.
+ for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
+ if (MI.getOperand(I).isReg()) {
+ Register Reg = MI.getOperand(I).getReg();
+ auto OpBank = getRegBankID(Reg, MRI);
+ unsigned Size = getSizeInBits(Reg, MRI, *TRI);
+ OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
+ }
+ }
+ break;
+ }
case Intrinsic::amdgcn_s_prefetch_data: {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 14fec71996a0e..c429e95f52a9d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1784,6 +1784,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
bool validateMIMGAddrSize(const MCInst &Inst, const SMLoc &IDLoc);
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGDim(const MCInst &Inst, const OperandVector &Operands);
+ bool validateTensorR128(const MCInst &Inst);
bool validateMIMGMSAA(const MCInst &Inst);
bool validateOpSel(const MCInst &Inst);
bool validateTrue16OpSel(const MCInst &Inst);
@@ -4280,6 +4281,20 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
return true;
}
+bool AMDGPUAsmParser::validateTensorR128(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::TENSOR_CNT) == 0)
+ return true;
+
+ int R128Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
+ if (R128Idx >= 0 && Inst.getOperand(R128Idx).getImm())
+ return false;
+
+ return true;
+}
+
static bool IsRevOpcode(const unsigned Opcode)
{
switch (Opcode) {
@@ -5113,14 +5128,11 @@ bool AMDGPUAsmParser::validateTHAndScopeBits(const MCInst &Inst,
return PrintError("scope and th combination is not valid");
}
- bool IsStore = TID.mayStore();
- bool IsAtomic =
- TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
-
- if (IsAtomic) {
+ unsigned THType = AMDGPU::getTemporalHintType(TID);
+ if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
if (!(CPol & AMDGPU::CPol::TH_TYPE_ATOMIC))
return PrintError("invalid th value for atomic instructions");
- } else if (IsStore) {
+ } else if (THType == AMDGPU::CPol::TH_TYPE_STORE) {
if (!(CPol & AMDGPU::CPol::TH_TYPE_STORE))
return PrintError("invalid th value for store instructions");
} else {
@@ -5205,6 +5217,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
Error(IDLoc, "missing dim operand");
return false;
}
+ if (!validateTensorR128(Inst)) {
+ Error(getImmLoc(AMDGPUOperand::ImmTyD16, Operands),
+ "instruction must set modifier r128=0");
+ return false;
+ }
if (!validateMIMGMSAA(Inst)) {
Error(getImmLoc(AMDGPUOperand::ImmTyDim, Operands),
"invalid dim; must be MSAA type");
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index a6ce42dca92be..fa1474d153834 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -173,13 +173,12 @@ void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
const unsigned Opcode = MI->getOpcode();
const MCInstrDesc &TID = MII.get(Opcode);
- bool IsStore = TID.mayStore();
- bool IsAtomic =
- TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
+ unsigned THType = AMDGPU::getTemporalHintType(TID);
+ bool IsStore = (THType == AMDGPU::CPol::TH_TYPE_STORE);
O << " th:";
- if (IsAtomic) {
+ if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
O << "TH_ATOMIC_";
if (TH & AMDGPU::CPol::TH_ATOMIC_CASCADE) {
if (Scope >= AMDGPU::CPol::SCOPE_DEV)
@@ -196,9 +195,6 @@ void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
if (!IsStore && TH == AMDGPU::CPol::TH_RESERVED)
O << formatHex(TH);
else {
- // This will default to printing load variants when neither MayStore nor
- // MayLoad flag is present which is the case with instructions like
- // image_get_resinfo.
O << (IsStore ? "TH_STORE_" : "TH_LOAD_");
switch (TH) {
case AMDGPU::CPol::TH_NT:
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 8d94d73bc1aab..531fae3ceff59 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -2019,3 +2019,97 @@ def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O_nortn, IMAGE_SAMPLE_CD_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O_nortn, IMAGE_SAMPLE_CD_CL_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O_nortn, IMAGE_SAMPLE_C_CD_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_O_G16_nortn>;
+
+//===----------------------------------------------------------------------===//
+// VIMAGE Tensor Instructions
+//===----------------------------------------------------------------------===//
+
+class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
+ InstSI<(outs ), (ins ), "", []>,
+ SIMCInstr<opName#!if(_UpTo2D, "_D2", ""), SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ string Mnemonic = opName;
+
+ let VALU = 1;
+ let maybeAtomic = 0;
+ let TENSOR_CNT = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let Uses = [EXEC, TENSORcnt];
+ let Defs = [TENSORcnt];
+ let SchedRW = [WriteVMEM, WriteLDS];
+ let UseNamedOperandTable = 1;
+ let hasSideEffects = 0;
+
+ bit UpTo2D = _UpTo2D;
+ let InOperandList = !if(UpTo2D, (ins SReg_128:$vaddr0, SReg_256:$vaddr1, R128A16:$r128, CPol:$cpol),
+ (ins SReg_128:$vaddr0, SReg_256:$vaddr1, SReg_128:$vaddr2,
+ SReg_128:$vaddr3, R128A16:$r128, CPol:$cpol));
+ string AsmOperands = " $vaddr0, $vaddr1"#!if(UpTo2D, "", ", $vaddr2, $vaddr3")#"$r128$cpol";
+}
+
+let SubtargetPredicate = isGFX1250Plus in {
+def TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">;
+def TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">;
+def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>;
+def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>;
+} // End SubtargetPredicate = isGFX1250Plus.
+
+class TensorPat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node v4i32:$vaddr0, v8i32:$vaddr1, v4i32:$vaddr2, v4i32:$vaddr3, (i32 timm:$cpol)),
+ (inst $vaddr0, $vaddr1, $vaddr2, $vaddr3, 0, $cpol)
+>;
+
+class TensorD2Pat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node v4i32:$vaddr0, v8i32:$vaddr1, (i32 timm:$cpol)),
+ (inst $vaddr0, $vaddr1, 0, $cpol)
+>;
+
+let SubtargetPredicate = isGFX1250Plus in {
+def : TensorPat <TENSOR_LOAD_TO_LDS, int_amdgcn_tensor_load_to_lds>;
+def : TensorPat <TENSOR_STORE_FROM_LDS, int_amdgcn_tensor_store_from_lds>;
+def : TensorD2Pat <TENSOR_LOAD_TO_LDS_D2, int_amdgcn_tensor_load_to_lds_d2>;
+def : TensorD2Pat <TENSOR_STORE_FROM_LDS_D2, int_amdgcn_tensor_store_from_lds_d2>;
+}
+
+class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = ps.Mnemonic> :
+ InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>,
+ VIMAGEe<op> {
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let TSFlags = ps.TSFlags;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let SchedRW = ps.SchedRW;
+
+ // D# group 2 and 3 set to NULL for 2D or less.
+ let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
+ let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
+
+ // set to 0 based on SPG.
+ let vaddr4 = 0;
+ let rsrc = 0;
+ let vdata = 0;
+ let d16 = 0;
+ let a16 = 0;
+ let tfe = 0;
+ let dmask = 1; // sp3
+ let dim = 1; // sp3
+}
+
+multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> {
+ let AssemblerPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in {
+ foreach DSuffix = ["_D2", ""] in {
+ defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix);
+ def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250>;
+ }
+ }
+}
+
+defm TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc4>;
+defm TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc5>;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 4b72f66abbd76..76e29e4393206 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -107,8 +107,7 @@ enum : uint64_t {
DisableWQM = UINT64_C(1) << 36,
Gather4 = UINT64_C(1) << 37,
- // Reserved, must be 0.
- Reserved0 = UINT64_C(1) << 38,
+ TENSOR_CNT = UINT64_C(1) << 38,
SCALAR_STORE = UINT64_C(1) << 39,
FIXED_SIZE = UINT64_C(1) << 40,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 42aae35112cac..c27d4e0df6fc5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -68,6 +68,9 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit Gather4 = 0;
+ // wait count to manage tensor loads/stores.
+ field bit TENSOR_CNT = 0;
+
// This is an s_store_dword* instruction that requires a cache flush
// on wave termination. It is necessary to distinguish from mayStore
// SMEM instructions like the cache flush ones.
@@ -201,8 +204,7 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{36} = DisableWQM;
let TSFlags{37} = Gather4;
- // Reserved, must be 0.
- let TSFlags{38} = 0;
+ let TSFlags{38} = TENSOR_CNT;
let TSFlags{39} = ScalarStore;
let TSFlags{40} = FixedSize;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/li...
[truncated]
``````````
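Per the intrinsic definitions in IntrinsicsAMDGPU.td above, the trailing cache-policy immediate packs the temporal hint in bits [0-2] and the scope in bits [3-4]. For example, the value 27 (0b11011) used in the CodeGen test encodes th=3 with scope=3, and 22 (0b10110) encodes th=6 with scope=2. A hypothetical helper, not part of this patch, that packs the two fields:

```c
// Hypothetical convenience macro (not part of the patch): packs the
// cache-policy immediate as documented in IntrinsicsAMDGPU.td,
// bits [0-2] = th, bits [3-4] = scope.
#define AMDGCN_TENSOR_CPOL(th, scope) (((th) & 0x7) | (((scope) & 0x3) << 3))

// AMDGCN_TENSOR_CPOL(3, 3) == 27 and AMDGCN_TENSOR_CPOL(6, 2) == 22,
// matching the immediates used in the CodeGen test above.
```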
https://github.com/llvm/llvm-project/pull/146636