[Mlir-commits] [mlir] [MLIR][NVVM] Add TMA prefetch Op (PR #141211)
Durgadoss R
llvmlistbot at llvm.org
Fri May 23 01:42:22 PDT 2025
https://github.com/durga4github created https://github.com/llvm/llvm-project/pull/141211
This patch adds an Op for the TMA prefetch
(non-tensor) variant. llvm-lit tests are added
to verify the lowering to the intrinsics.
>From 490b5260786e7c4558b7da7f07df12ae8081f20f Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadossr at nvidia.com>
Date: Thu, 22 May 2025 19:15:56 +0530
Subject: [PATCH] [MLIR][NVVM] Add TMA prefetch Op
This patch adds an Op for the TMA
prefetch (non-tensor) variant.
Signed-off-by: Durgadoss R <durgadossr at nvidia.com>
---
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 43 +++++++++++++++++++
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 20 +++++++++
.../test/Target/LLVMIR/nvvm/tma_prefetch.mlir | 9 ++++
3 files changed, 72 insertions(+)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 0c5c87cfe002f..c6c8f59db8c0d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -2344,6 +2344,49 @@ def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
}];
}
+def NVVM_CpAsyncBulkPrefetchOp : NVVM_Op<"cp.async.bulk.prefetch"> {
+ let summary = "Async bulk prefetch from global memory to L2 cache";
+ let description = [{
+ Initiates an asynchronous prefetch of data from the location
+ specified by `srcMem` to the L2 cache.
+
+ The `l2CacheHint` operand is optional, and it is used to specify cache
+ eviction policy that may be used during the memory access.
+
+ Example:
+ ```mlir
+ nvvm.cp.async.bulk.prefetch %src, %size : !llvm.ptr<1>
+
+ // with l2_cache_hint
+ nvvm.cp.async.bulk.prefetch %src, %size l2_cache_hint = %ch : !llvm.ptr<1>
+ ```
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch)
+ }];
+
+ let arguments = (ins
+ LLVM_PointerGlobal:$srcMem,
+ I32:$size,
+ Optional<I64>:$l2CacheHint);
+
+ let assemblyFormat = [{
+ $srcMem `,` $size (`l2_cache_hint` `=` $l2CacheHint^ )?
+ attr-dict `:` type($srcMem)
+ }];
+
+ let extraClassDeclaration = [{
+ static mlir::NVVM::IDArgPair
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase& builder);
+ }];
+
+ string llvmBuilder = [{
+ auto [id, args] = NVVM::CpAsyncBulkPrefetchOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, builder);
+ createIntrinsicCall(builder, id, args);
+ }];
+}
+
def NVVM_CpAsyncBulkTensorPrefetchOp :
NVVM_Op<"cp.async.bulk.tensor.prefetch", [AttrSizedOperandSegments]> {
let arguments = (ins
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 9f55fe315106c..ad98dfc59e029 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1254,6 +1254,26 @@ CpAsyncOp::getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
return id;
}
+mlir::NVVM::IDArgPair CpAsyncBulkPrefetchOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::CpAsyncBulkPrefetchOp>(op);
+ llvm::SmallVector<llvm::Value *> args;
+ llvm::Intrinsic::ID id = llvm::Intrinsic::nvvm_cp_async_bulk_prefetch_L2;
+
+ // Fill the Intrinsic Args
+ args.push_back(mt.lookupValue(thisOp.getSrcMem()));
+ args.push_back(mt.lookupValue(thisOp.getSize()));
+
+ mlir::Value cacheHint = thisOp.getL2CacheHint();
+ const bool hasCacheHint = static_cast<bool>(cacheHint);
+ llvm::Value *i64Unused =
+ llvm::ConstantInt::get(llvm::Type::getInt64Ty(mt.getLLVMContext()), 0);
+ args.push_back(hasCacheHint ? mt.lookupValue(cacheHint) : i64Unused);
+ args.push_back(builder.getInt1(hasCacheHint));
+
+ return {id, std::move(args)};
+}
+
mlir::NVVM::IDArgPair CpAsyncBulkSharedCTAToGlobalOp::getIntrinsicIDAndArgs(
Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
auto thisOp = cast<NVVM::CpAsyncBulkSharedCTAToGlobalOp>(op);
diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir
index f1fa3b61f2dd9..bfd952636ffbe 100644
--- a/mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir
+++ b/mlir/test/Target/LLVMIR/nvvm/tma_prefetch.mlir
@@ -1,5 +1,14 @@
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+// CHECK-LABEL: @tma_bulk_prefetch
+llvm.func @tma_bulk_prefetch(%src : !llvm.ptr<1>, %size : i32, %ch : i64) {
+ // CHECK: call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %{{.*}}, i32 %{{.*}}, i64 0, i1 false)
+ // CHECK: call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %{{.*}}, i32 %{{.*}}, i64 %{{.*}}, i1 true)
+ nvvm.cp.async.bulk.prefetch %src, %size : !llvm.ptr<1>
+ nvvm.cp.async.bulk.prefetch %src, %size l2_cache_hint = %ch : !llvm.ptr<1>
+ llvm.return
+}
+
// CHECK-LABEL: @tma_prefetch_1d
llvm.func @tma_prefetch_1d(%tma_desc : !llvm.ptr, %d0 : i32, %ch : i64) {
// CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %0, i32 %{{.*}}, i64 0, i1 false)
More information about the Mlir-commits
mailing list