[Mlir-commits] [mlir] 5f15fee - [mlir][amdgpu] Add tensor load store operations (#172686)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Dec 17 09:37:32 PST 2025
Author: Erick Ochoa Lopez
Date: 2025-12-17T12:37:27-05:00
New Revision: 5f15fee8acc7817f3c77f3060ce06a38ae53a910
URL: https://github.com/llvm/llvm-project/commit/5f15fee8acc7817f3c77f3060ce06a38ae53a910
DIFF: https://github.com/llvm/llvm-project/commit/5f15fee8acc7817f3c77f3060ce06a38ae53a910.diff
LOG: [mlir][amdgpu] Add tensor load store operations (#172686)
Reland https://github.com/llvm/llvm-project/pull/170918
This PR differs from the original one by making the target
materialization more restrictive: it only creates an
unrealized_conversion_cast when given a single input of
TDMDescriptorType.
Added:
Modified:
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 96f5f5c6f1a3f..4865dc13f324b 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -1587,4 +1587,35 @@ def AMDGPU_MakeDmaDescriptorOp : AMDGPU_MakeDescriptorOp<"make_dma_descriptor">
}
+def AMDGPU_TensorLoadToLDSOp :
+ AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite, MemRead]>]>,
+ Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
+ let summary = "Load tensors from global memory to LDS.";
+ let description = [{
+ Load tensors of up to five dimensions from global memory to LDS.
+
+ This operation was introduced in gfx1250.
+ }];
+
+ let assemblyFormat = [{
+ $desc attr-dict `:` qualified(type($desc))
+ }];
+}
+
+def AMDGPU_TensorStoreFromLDSOp :
+ AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite, MemRead]>]>,
+ Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
+
+ let summary = "Store tensors from LDS to global memory.";
+ let description = [{
+ Store tensors of up to five dimensions from LDS to global memory.
+
+ This operation was introduced in gfx1250.
+ }];
+
+ let assemblyFormat = [{
+ $desc attr-dict `:` qualified(type($desc))
+ }];
+}
+
#endif // AMDGPU
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 541bb02d79eae..90009c9722fe3 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -3218,11 +3218,6 @@ struct AMDGPULowerDescriptor : public ConvertOpToLLVMPattern<DescriptorOp> {
Location loc = op.getLoc();
- IntegerType i32 = rewriter.getI32Type();
- [[maybe_unused]] Type v4i32 =
- this->typeConverter->convertType(VectorType::get(4, i32));
- assert(v4i32 && "expected type conversion to succeed");
-
SmallVector<Value> consts;
for (int64_t i = 0; i < 8; ++i)
consts.push_back(createI32Constant(rewriter, loc, i));
@@ -3237,6 +3232,32 @@ struct AMDGPULowerDescriptor : public ConvertOpToLLVMPattern<DescriptorOp> {
}
};
+template <typename SourceOp, typename TargetOp>
+struct AMDGPUTensorLoadStoreOpLowering
+ : public ConvertOpToLLVMPattern<SourceOp> {
+ using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
+ using Adaptor = typename ConvertOpToLLVMPattern<SourceOp>::OneToNOpAdaptor;
+ AMDGPUTensorLoadStoreOpLowering(const LLVMTypeConverter &converter,
+ Chipset chipset)
+ : ConvertOpToLLVMPattern<SourceOp>(converter), chipset(chipset) {}
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(SourceOp op, Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (chipset < kGfx1250)
+ return op->emitOpError("is only supported on gfx1250");
+
+ ValueRange desc = adaptor.getDesc();
+ rewriter.replaceOpWithNewOp<TargetOp>(op, desc[0], desc[1], desc[2],
+ desc[3], /*cachePolicy=*/0,
+ /*alias_scopes=*/nullptr,
+ /*noalias_scopes=*/nullptr,
+ /*tbaa=*/nullptr);
+ return success();
+ }
+};
+
struct ConvertAMDGPUToROCDLPass
: public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
using Base::Base;
@@ -3306,6 +3327,33 @@ void mlir::populateAMDGPUTypeAndAttributeConversions(
Type i32 = IntegerType::get(type.getContext(), 32);
return typeConverter.convertType(VectorType::get(4, i32));
});
+ typeConverter.addConversion(
+ [&](TDMDescriptorType type,
+ SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
+ Type i32 = IntegerType::get(type.getContext(), 32);
+ Type v4i32 = typeConverter.convertType(VectorType::get(4, i32));
+ Type v8i32 = typeConverter.convertType(VectorType::get(8, i32));
+ llvm::append_values(result, v4i32, v8i32, v4i32, v4i32);
+ return success();
+ });
+
+ auto addUnrealizedCast = [](OpBuilder &builder, TypeRange types,
+ ValueRange inputs,
+ Location loc) -> SmallVector<Value> {
+ // Only create unrealized_conversion_cast for TDMDescriptorType.
+ // All other types which are not expected, should be
+ // materialized by other target materialization functions.
+ if (inputs.size() != 1)
+ return {};
+
+ if (!isa<TDMDescriptorType>(inputs[0].getType()))
+ return {};
+
+ auto cast = UnrealizedConversionCastOp::create(builder, loc, types, inputs);
+ return cast.getResults();
+ };
+
+ typeConverter.addTargetMaterialization(addUnrealizedCast);
}
void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
@@ -3336,7 +3384,11 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
AMDGPUMakeDmaBaseLowering<MakeDmaBaseOp>,
AMDGPUMakeDmaBaseLowering<MakeGatherDmaBaseOp>,
AMDGPULowerDescriptor<MakeDmaDescriptorOp>,
- AMDGPULowerDescriptor<MakeGatherDmaDescriptorOp>>(converter,
- chipset);
+ AMDGPULowerDescriptor<MakeGatherDmaDescriptorOp>,
+ AMDGPUTensorLoadStoreOpLowering<TensorLoadToLDSOp,
+ ROCDL::TensorLoadToLDSOp>,
+ AMDGPUTensorLoadStoreOpLowering<TensorStoreFromLDSOp,
+ ROCDL::TensorStoreFromLDSOp>>(
+ converter, chipset);
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
index 4979e85785970..e62db9ff571bf 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir
@@ -773,6 +773,24 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
func.return %descriptor : !amdgpu.tdm_descriptor
}
+// CHECK-LABEL: func @tensor_load_to_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
+func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor) {
+ // CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
+ // CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor
+ func.return
+}
+
+// CHECK-LABEL: func @tensor_store_from_lds
+// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
+func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor) {
+ // CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
+ // CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
+ amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor
+ func.return
+}
+
// -----
// CHECK-LABEL: func @make_gather_dma_descriptor
More information about the Mlir-commits
mailing list