[clang] [llvm] [AMDGPU] Use a general form of intrinsic for tensor load/store (PR #182334)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Feb 19 10:22:20 PST 2026
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Changpeng Fang (changpeng)
Changes:
The intrinsic takes five arguments for the tensor descriptor (D#); the fifth is reserved for future targets and is silently ignored in codegen for gfx1250.
For tensors up to 2D, only the first two D# groups are meaningful; the rest should be zero-initialized.
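For illustration, a minimal OpenCL C sketch of a call under the new signature, mirroring the updated tests below (the zero vectors fill the unused D# groups of a 2D tensor, and the trailing constant is the cache policy):

```c
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));

void load_2d_tensor_to_lds(v4i sg0, v8i sg1) {
  // D# groups 2-4 are unused for tensors up to 2D, so they are passed as
  // zero-initialized vectors; the last argument is the cachepolicy immediate.
  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, (v4i){0, 0, 0, 0},
                                      (v4i){0, 0, 0, 0},
                                      (v8i){0, 0, 0, 0, 0, 0, 0, 0}, 0);
}
```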
---
Patch is 44.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182334.diff
14 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.td (+3-4)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl (+38-8)
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl (+3-5)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+7-24)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+35)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp (-21)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+44)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+1-8)
- (modified) llvm/lib/Target/AMDGPU/MIMGInstructions.td (-17)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll (+49-33)
- (modified) llvm/test/CodeGen/AMDGPU/waitcnt-debug-output-crash.ll (+1-1)
- (removed) llvm/test/Transforms/InstCombine/AMDGPU/tensor-load-store-lds.ll (-185)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 86b10eba55e8e..966a176a6882d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -752,10 +752,9 @@ def __builtin_amdgcn_global_store_async_from_lds_b128 : AMDGPUBuiltin<"void(_Ext
def __builtin_amdgcn_ds_atomic_async_barrier_arrive_b64 : AMDGPUBuiltin<"void(long int address_space<3> *)", [Const], "gfx1250-insts">;
def __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64 : AMDGPUBuiltin<"long int(long int address_space<3> *, long int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_load_to_lds_d2 : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _Constant int)", [Const], "gfx1250-insts">;
-def __builtin_amdgcn_tensor_store_from_lds_d2 : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_tensor_load_to_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
+def __builtin_amdgcn_tensor_store_from_lds : AMDGPUBuiltin<"void(_ExtVector<4, int>, _ExtVector<8, int>, _ExtVector<4, int>, _ExtVector<4, int>, _ExtVector<8, int>, _Constant int)", [Const], "gfx1250-insts">;
+
def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">;
def __builtin_amdgcn_global_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
index 49ffbf4517160..cb106805d24bd 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
@@ -5,42 +5,72 @@
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));
+static v4i v4i_zeros = (v4i){0,0,0,0};
+static v8i v8i_zeros = (v8i){0,0,0,0,0,0,0,0};
+
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds(
// CHECK-GFX1250-NEXT: entry:
-// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 0)
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
{
- __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);
+ __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, v8i_zeros, 0);
}
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d2(
// CHECK-GFX1250-NEXT: entry:
-// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 27)
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 27)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1)
{
- __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 27);
+ __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 27);
}
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds(
// CHECK-GFX1250-NEXT: entry:
-// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 22)
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> zeroinitializer, i32 22)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
{
- __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, 22);
+ __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, v8i_zeros, 22);
}
// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d2(
// CHECK-GFX1250-NEXT: entry:
-// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 0)
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer, <8 x i32> zeroinitializer, i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_store_from_lds_d2(v4i sg0, v8i sg1)
{
- __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
+ __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, v4i_zeros, v4i_zeros, v8i_zeros, 0);
+}
+
+//========================================================================
+// It is fine to pass five D# groups, but the fifth one will be ignored
+// by LLVM codegen for gfx1250, which only supports up to four D#
+// groups.
+//========================================================================
+
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d5(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_load_to_lds_d5(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4)
+{
+ __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, sg4, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d5(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], <8 x i32> [[SG4:%.*]], i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_store_from_lds_d5(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4)
+{
+ __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, 0);
}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 8ab4f43d70c40..295707b53ed18 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -183,12 +183,10 @@ void test_amdgcn_async_load_store_lds_cpol(global char* gaddr8, global int *gadd
__builtin_amdgcn_global_store_async_from_lds_b128(gaddr128, laddr128, 16, cpol); // expected-error {{'__builtin_amdgcn_global_store_async_from_lds_b128' must be a constant integer}}
}
-void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
+void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, v8i sg4, int cpol)
{
- __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
- __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds_d2' must be a constant integer}}
- __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
- __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
+ __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
+ __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, sg4, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
}
void test_prefetch(generic void *fptr, global void *gptr, int cpol) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 66591519de73e..9101666c2a49c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -4194,41 +4194,24 @@ def int_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse<llvm
def int_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUSWmmacIntrinsicABIdxClamp<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
}
-
class AMDGPUTensorLoadStore:
Intrinsic<
[],
[llvm_v4i32_ty, // D# group 0
llvm_v8i32_ty, // D# group 1
- llvm_v4i32_ty, // D# group 2
- llvm_v4i32_ty, // D# group 3
+ llvm_v4i32_ty, // D# group 2: groups 2 and 3 should be zero-initialized for tensors up to 2D.
+ llvm_v4i32_ty, // D# group 3
+ llvm_v8i32_ty, // D# group 4: reserved for future targets; use <8 x i32> zeroinitializer for now.
+                // This argument is silently ignored.
llvm_i32_ty], // cachepolicy:
// bits [0-2] = th
// bits [3-4] = scope
- [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
- "", [SDNPMemOperand]
- >;
-
-class AMDGPUTensorLoadStoreD2:
- Intrinsic<
- [],
- [llvm_v4i32_ty, // D# group 0
- llvm_v8i32_ty, // D# group 1
- llvm_i32_ty], // cachepolicy:
- // bits [0-2] = th
- // bits [3-4] = scope
- [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
+ [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]
>;
-def int_amdgcn_tensor_load_to_lds :
- ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore;
-def int_amdgcn_tensor_store_from_lds :
- ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore;
-def int_amdgcn_tensor_load_to_lds_d2 :
- ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds_d2">, AMDGPUTensorLoadStoreD2;
-def int_amdgcn_tensor_store_from_lds_d2 :
- ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;
+def int_amdgcn_tensor_load_to_lds : ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_store_from_lds : ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore;
class AMDGPUClusterLoad<LLVMType ptr_ty>:
Intrinsic<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 238f06fbd33c0..b0a26495c014d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3005,6 +3005,37 @@ void AMDGPUDAGToDAGISel::SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
+void AMDGPUDAGToDAGISel::SelectTensorLoadStore(SDNode *N, unsigned IntrID) {
+ bool IsLoad = IntrID == Intrinsic::amdgcn_tensor_load_to_lds;
+ unsigned Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS :
+ AMDGPU::TENSOR_STORE_FROM_LDS;
+
+ SmallVector<SDValue, 7> TensorOps;
+ // First two groups
+ TensorOps.push_back(N->getOperand(2)); // D# group 0
+ TensorOps.push_back(N->getOperand(3)); // D# group 1
+
+ // Use the _D2 version if both groups 2 and 3 are zero-initialized.
+ SDValue Group2 = N->getOperand(4);
+ SDValue Group3 = N->getOperand(5);
+ if (ISD::isBuildVectorAllZeros(Group2.getNode()) &&
+ ISD::isBuildVectorAllZeros(Group3.getNode())) {
+ Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2 :
+ AMDGPU::TENSOR_STORE_FROM_LDS_D2;
+ } else { // Has at least 4 groups
+ TensorOps.push_back(Group2); // D# group 2
+ TensorOps.push_back(Group3); // D# group 3
+ }
+
+ // TODO: Handle the fifth group: N->getOperand(6), which is silently ignored
+ // for now because all existing targets only support up to 4 groups.
+ TensorOps.push_back(CurDAG->getTargetConstant(0, SDLoc(N), MVT::i1)); // r128
+ TensorOps.push_back(N->getOperand(7)); // cache policy
+ TensorOps.push_back(N->getOperand(0)); // chain
+
+ (void)CurDAG->SelectNodeTo(N, Opc, MVT::Other, TensorOps);
+}
+
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
switch (IntrID) {
case Intrinsic::amdgcn_ds_gws_init:
@@ -3287,6 +3318,10 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
case Intrinsic::amdgcn_ds_gws_sema_release_all:
SelectDS_GWS(N, IntrID);
return;
+ case Intrinsic::amdgcn_tensor_load_to_lds:
+ case Intrinsic::amdgcn_tensor_store_from_lds:
+ SelectTensorLoadStore(N, IntrID);
+ return;
default:
break;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a86b75458923e..ffeb6dfdb3f90 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -285,6 +285,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectFP_EXTEND(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDSBvhStackIntrinsic(SDNode *N, unsigned IntrID);
+ void SelectTensorLoadStore(SDNode *N, unsigned IntrID);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 0ebe69de56fa9..02879a7bba897 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1821,27 +1821,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
NewII->takeName(&II);
return IC.replaceInstUsesWith(II, NewII);
}
- case Intrinsic::amdgcn_tensor_load_to_lds:
- case Intrinsic::amdgcn_tensor_store_from_lds: {
- Value *D2 = II.getArgOperand(2);
- Value *D3 = II.getArgOperand(3);
- // We know that not passing the second and third tensor DMA groups is
- // equivalent to passing zeroes for those registers, so we rewrite to the
- // shorter form here. Undef or poison are replaced by 0.
- auto Pred = m_CombineOr(m_Zero(), m_Undef());
- if (!match(D2, Pred) || !match(D3, Pred))
- return std::nullopt;
-
- auto ShortIntrinsic = IID == Intrinsic::amdgcn_tensor_load_to_lds
- ? Intrinsic::amdgcn_tensor_load_to_lds_d2
- : Intrinsic::amdgcn_tensor_store_from_lds_d2;
- CallInst *NewII = IC.Builder.CreateIntrinsic(
- ShortIntrinsic,
- {II.getArgOperand(0), II.getArgOperand(1), II.getArgOperand(4)});
- NewII->takeName(&II);
- NewII->copyMetadata(II);
- return IC.eraseInstFromFunction(II);
- }
case Intrinsic::amdgcn_wave_shuffle: {
if (!ST->hasDPP())
return std::nullopt;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 7f913cfca5d7c..14c50a52b08a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2393,6 +2393,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_global_load_lds:
case Intrinsic::amdgcn_global_load_async_lds:
return selectGlobalLoadLds(I);
+ case Intrinsic::amdgcn_tensor_load_to_lds:
+ case Intrinsic::amdgcn_tensor_store_from_lds:
+ return selectTensorLoadStore(I, IntrinsicID);
case Intrinsic::amdgcn_asyncmark:
case Intrinsic::amdgcn_wait_asyncmark:
// FIXME: Not supported on GFX12 yet. Will need a new feature when we do.
@@ -3787,6 +3790,47 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
return true;
}
+bool AMDGPUInstructionSelector::selectTensorLoadStore(MachineInstr &MI,
+ Intrinsic::ID IID) const {
+ bool IsLoad = IID == Intrinsic::amdgcn_tensor_load_to_lds;
+ unsigned Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS :
+ AMDGPU::TENSOR_STORE_FROM_LDS;
+ int NumGroups = 4;
+
+ // A lamda function to check whether an operand is a vector of all 0s.
+ const auto isAllZeros = [&](MachineOperand &Opnd) {
+ const MachineInstr *DefMI = MRI->getVRegDef(Opnd.getReg());
+ if (!DefMI)
+ return false;
+ return llvm::isBuildVectorAllZeros(*DefMI, *MRI, true);
+ };
+
+ // Use the _D2 version if both groups 2 and 3 are zero-initialized.
+ if (isAllZeros(MI.getOperand(3)) && isAllZeros(MI.getOperand(4))) {
+ NumGroups = 2;
+ Opc = IsLoad ? AMDGPU::TENSOR_LOAD_TO_LDS_D2 :
+ AMDGPU::TENSOR_STORE_FROM_LDS_D2;
+ }
+
+ // TODO: Handle the fifth group: MI.getOperand(5), which is silently ignored
+ // for now because all existing targets only support up to 4 groups.
+ MachineBasicBlock *MBB = MI.getParent();
+ auto MIB = BuildMI(*MBB, &MI, MI.getDebugLoc(), TII.get(Opc))
+ .add(MI.getOperand(1)) // D# group 0
+ .add(MI.getOperand(2)); // D# group 1
+
+ if (NumGroups >= 4) { // Has at least 4 groups
+ MIB.add(MI.getOperand(3)) // D# group 2
+ .add(MI.getOperand(4)); // D# group 3
+ }
+
+ MIB.addImm(0) // r128
+ .add(MI.getOperand(6)); // cpol
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::selectBVHIntersectRayIntrinsic(
MachineInstr &MI) const {
unsigned OpcodeOpIdx =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 627cce277ae38..98c4e7837a1ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -145,6 +145,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
bool selectBufferLoadLds(MachineInstr &MI) const;
bool selectGlobalLoadLds(MachineInstr &MI) const;
+ bool selectTensorLoadStore(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectBVHIntersectRayIntrinsic(MachineInstr &I) const;
bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectPermlaneSwapIntrin(MachineInstr &I, Intrinsic::ID IntrID) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index e8f316d332321..7e047278fe78f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3388,12 +3388,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 2);
constrainOpWithReadfirstlane(B, MI, 3);
constrainOpWithReadfirstlane(B, MI, 4);
- return;
- }
- case Intrinsic::amdgcn_tensor_load_to_lds_d2:
- case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
- constrainOpWithReadfirstlane(B, MI, 1);
- constrainOpWithReadfirstlane(B, MI, 2);
+ constrainOpWithReadfirstlane(B, MI, 5);
return;
}
default: {
@@ -5636,8 +5631,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_pops_exiting_wave_id:
return getDefaultMappingSOP(MI);
- case Intrinsic::amdgcn_tensor_load_to_lds_d2:
- case Intrinsic::amdgcn_tensor_store_from_lds_d2:
case Intrinsic::amdgcn_tensor_load_to_lds:
case Intrinsic::amdgcn_tensor_store_from_lds: {
// Lie and claim everything is legal, even all operands need to be
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index b023c96296b2c..0521e199c31dd 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions....
[truncated]
``````````
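As a side note on the cachepolicy immediate: per the intrinsic definition above, bits [0-2] encode th and bits [3-4] encode scope. A hypothetical C macro for packing it (TENSOR_CPOL is an illustrative name, not part of any API), checked against the constants used in the tests:

```c
// Pack th (bits [0-2]) and scope (bits [3-4]) into the cachepolicy
// immediate. A macro keeps the result a compile-time constant, which the
// builtins require for their final argument.
#define TENSOR_CPOL(th, scope) (((th) & 0x7u) | (((scope) & 0x3u) << 3))

// 27 and 22 are the cpol values that appear in the tests above.
_Static_assert(TENSOR_CPOL(3, 3) == 27, "th=3, scope=3");
_Static_assert(TENSOR_CPOL(6, 2) == 22, "th=6, scope=2");
```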
https://github.com/llvm/llvm-project/pull/182334