[Mlir-commits] [mlir] [MLIR][NVVM] Add tcgen05 alloc/dealloc Ops (PR #125674)
Durgadoss R
llvmlistbot at llvm.org
Tue Feb 4 06:11:35 PST 2025
https://github.com/durga4github updated https://github.com/llvm/llvm-project/pull/125674
>From 8dae72f1196528c3240798ba046fcf8130ba384c Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadossr at nvidia.com>
Date: Sat, 1 Feb 2025 18:38:34 +0530
Subject: [PATCH] [MLIR][NVVM] Add tcgen05 alloc/dealloc Ops
PR #124961 adds intrinsics for the tcgen05
alloc/dealloc PTX instructions. This patch
adds NVVM Ops for the same.
Tests are added to verify the lowering to
the corresponding intrinsics in tcgen05-alloc.mlir file.
Signed-off-by: Durgadoss R <durgadossr at nvidia.com>
---
.../include/mlir/Dialect/LLVMIR/NVVMDialect.h | 6 +-
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 105 ++++++++++++++++++
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 41 +++++++
.../Target/LLVMIR/nvvm/tcgen05-alloc.mlir | 42 +++++++
4 files changed, 193 insertions(+), 1 deletion(-)
create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
index d474ba8485d5d8..a9270c6f52344f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
@@ -39,7 +39,11 @@ enum NVVMMemorySpace {
/// Shared memory space identifier.
kSharedMemorySpace = 3,
/// Constant memory space identifier.
- kConstantMemorySpace = 4
+ kConstantMemorySpace = 4,
+ /// Tensor memory space identifier.
+ /// Tensor memory is available only in arch-accelerated
+ /// variants from sm100 onwards.
+ kTensorMemorySpace = 6
};
/// Return the element type and number of elements associated with a wmma matrix
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 23db9375fbffe2..c501b5e7c10015 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -23,6 +23,7 @@ include "mlir/Interfaces/InferIntRangeInterface.td"
def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;
+def LLVM_PointerTensor : LLVM_PointerInAddressSpace<6>;
//===----------------------------------------------------------------------===//
// NVVM dialect definitions
@@ -2592,6 +2593,110 @@ def NVVM_Breakpoint : NVVM_Op<"breakpoint"> {
let assemblyFormat = "attr-dict";
}
+//===----------------------------------------------------------------------===//
+// NVVM TCGEN05 Ops
+//===----------------------------------------------------------------------===//
+// Num CTAs in a group participating in the TCGEN05 operation.
+// This corresponds to the "cta_group::1", "cta_group::2"
+// modifiers in the PTX instructions.
+def Tcgen05GroupCTA_1 : I32EnumAttrCase<"CTA_1", 0, "cta_1">;
+def Tcgen05GroupCTA_2 : I32EnumAttrCase<"CTA_2", 1, "cta_2">;
+
+def Tcgen05GroupKind : I32EnumAttr<"Tcgen05GroupKind",
+ "NVVM Tcgen05 group kind",
+ [Tcgen05GroupCTA_1, Tcgen05GroupCTA_2]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::NVVM";
+}
+def Tcgen05GroupKindAttr :
+ EnumAttr<NVVM_Dialect, Tcgen05GroupKind, "tcgen05_group"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_Tcgen05AllocOp : NVVM_Op<"tcgen05.alloc"> {
+ let summary = "Tcgen05 alloc operation";
+ let description = [{
+ The `tcgen05.alloc` Op allocates tensor core memory for
+ the amount specified by `nCols` and writes the destination
+ address to the `addr` argument. The `nCols` operand specifies the
+ number of columns to be allocated and it must be a power-of-two.
+ [For more information, refer to the PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+ }];
+
+ let arguments = (ins
+ AnyTypeOf<[LLVM_AnyPointer, LLVM_PointerShared]>:$addr,
+ I32:$nCols,
+ DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
+
+ let assemblyFormat = "$addr `,` $nCols attr-dict `:` type(operands)";
+
+ let extraClassDeclaration = [{
+ static llvm::Intrinsic::ID
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::SmallVector<llvm::Value *> &args);
+ }];
+ string llvmBuilder = [{
+ llvm::SmallVector<llvm::Value *> args;
+ auto id = NVVM::Tcgen05AllocOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, args);
+ createIntrinsicCall(builder, id, args);
+ }];
+}
+
+def NVVM_Tcgen05DeallocOp : NVVM_Op<"tcgen05.dealloc"> {
+ let summary = "Tcgen05 dealloc operation";
+ let description = [{
+ The `tcgen05.dealloc` Op de-allocates the tensor core memory
+ specified by `tmemAddr`, which must be from a previous tensor
+ memory allocation. The `nCols` operand specifies the number
+ of columns to be de-allocated, and it must be a power-of-two.
+ [For more information, refer to the PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+ }];
+
+ let arguments = (ins LLVM_PointerTensor:$taddr, I32:$nCols,
+ DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
+
+ let assemblyFormat = "$taddr `,` $nCols attr-dict `:` type(operands)";
+
+ let extraClassDeclaration = [{
+ static llvm::Intrinsic::ID
+ getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::SmallVector<llvm::Value *> &args);
+ }];
+ string llvmBuilder = [{
+ llvm::SmallVector<llvm::Value *> args;
+ auto id = NVVM::Tcgen05DeallocOp::getIntrinsicIDAndArgs(
+ *op, moduleTranslation, args);
+ createIntrinsicCall(builder, id, args);
+ }];
+}
+
+def NVVM_Tcgen05RelinquishAllocPermitOp : NVVM_Op<"tcgen05.relinquish_alloc_permit"> {
+ let summary = "Tcgen05 Op to relinquish the right to allocate";
+ let description = [{
+ The `tcgen05.relinquish_alloc_permit` Op specifies that the CTA
+ of the executing thread is relinquishing the right to allocate
+ Tensor Memory. So, it is illegal for a CTA to perform `tcgen05.alloc`
+ after any of its constituent threads execute `tcgen05.relinquish_alloc_permit`.
+ [For more information, refer to the PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-memory-alloc-manage-instructions)
+ }];
+
+ let arguments = (ins
+ DefaultValuedAttr<Tcgen05GroupKindAttr, "Tcgen05GroupKind::CTA_1">:$group);
+
+ let assemblyFormat = "attr-dict";
+
+ string llvmBuilder = [{
+ auto id = ($group == NVVM::Tcgen05GroupKind::CTA_1) ?
+ llvm::Intrinsic::nvvm_tcgen05_relinq_alloc_permit_cg1 :
+ llvm::Intrinsic::nvvm_tcgen05_relinq_alloc_permit_cg2;
+ createIntrinsicCall(builder, id);
+ }];
+}
+
//===----------------------------------------------------------------------===//
// NVVM target attribute.
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index a5d09eaa34eb54..241b25c6caf128 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1243,6 +1243,47 @@ llvm::Intrinsic::ID CvtFloatToTF32Op::getIntrinsicID(NVVM::FPRoundingMode rnd,
}
}
+llvm::Intrinsic::ID
+Tcgen05AllocOp::getIntrinsicIDAndArgs(Operation &op,
+ LLVM::ModuleTranslation &mt,
+ llvm::SmallVector<llvm::Value *> &args) {
+ auto curOp = cast<NVVM::Tcgen05AllocOp>(op);
+ unsigned AS = llvm::cast<LLVM::LLVMPointerType>(curOp.getAddr().getType())
+ .getAddressSpace();
+ bool isShared = AS == NVVMMemorySpace::kSharedMemorySpace;
+ bool is2CTAMode = curOp.getGroup() == Tcgen05GroupKind::CTA_2;
+
+ llvm::Intrinsic::ID id;
+ if (isShared) {
+ id = is2CTAMode ? llvm::Intrinsic::nvvm_tcgen05_alloc_shared_cg2
+ : llvm::Intrinsic::nvvm_tcgen05_alloc_shared_cg1;
+ } else {
+ id = is2CTAMode ? llvm::Intrinsic::nvvm_tcgen05_alloc_cg2
+ : llvm::Intrinsic::nvvm_tcgen05_alloc_cg1;
+ }
+
+ // Fill the Intrinsic Args
+ args.push_back(mt.lookupValue(curOp.getAddr()));
+ args.push_back(mt.lookupValue(curOp.getNCols()));
+
+ return id;
+}
+
+llvm::Intrinsic::ID Tcgen05DeallocOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt,
+ llvm::SmallVector<llvm::Value *> &args) {
+ auto curOp = cast<NVVM::Tcgen05DeallocOp>(op);
+ auto id = (curOp.getGroup() == Tcgen05GroupKind::CTA_1)
+ ? llvm::Intrinsic::nvvm_tcgen05_dealloc_cg1
+ : llvm::Intrinsic::nvvm_tcgen05_dealloc_cg2;
+
+ // Fill the Intrinsic Args
+ args.push_back(mt.lookupValue(curOp.getTaddr()));
+ args.push_back(mt.lookupValue(curOp.getNCols()));
+
+ return id;
+}
+
/// Infer the result ranges for the NVVM SpecialRangeableRegisterOp that might
/// have ConstantRangeAttr.
static void nvvmInferResultRanges(Operation *op, Value result,
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir
new file mode 100644
index 00000000000000..781efa25671111
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-alloc.mlir
@@ -0,0 +1,42 @@
+// RUN: mlir-opt -split-input-file -verify-diagnostics %s
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-LLVM
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_alloc
+llvm.func @llvm_nvvm_tcgen05_alloc(%addr : !llvm.ptr, %ncols : i32) {
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.cg1(ptr %{{.*}}, i32 %{{.*}})
+ nvvm.tcgen05.alloc %addr, %ncols : !llvm.ptr, i32
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.cg2(ptr %{{.*}}, i32 %{{.*}})
+ nvvm.tcgen05.alloc %addr, %ncols {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr, i32
+ llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_alloc_shared
+llvm.func @llvm_nvvm_tcgen05_alloc_shared(%addr : !llvm.ptr<3>, %ncols : i32) {
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.shared.cg1(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
+ nvvm.tcgen05.alloc %addr, %ncols : !llvm.ptr<3>, i32
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.alloc.shared.cg2(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
+ nvvm.tcgen05.alloc %addr, %ncols {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr<3>, i32
+ llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_dealloc
+llvm.func @llvm_nvvm_tcgen05_dealloc(%addr : !llvm.ptr<6>, %ncols : i32) {
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.dealloc.cg1(ptr addrspace(6) %{{.*}}, i32 %{{.*}})
+ nvvm.tcgen05.dealloc %addr, %ncols : !llvm.ptr<6>, i32
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.dealloc.cg2(ptr addrspace(6) %{{.*}}, i32 %{{.*}})
+ nvvm.tcgen05.dealloc %addr, %ncols {group = #nvvm.tcgen05_group<cta_2>} : !llvm.ptr<6>, i32
+ llvm.return
+}
+
+// CHECK-LABEL: @llvm_nvvm_tcgen05_relinquish_alloc_permit
+llvm.func @llvm_nvvm_tcgen05_relinquish_alloc_permit() {
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg1()
+ nvvm.tcgen05.relinquish_alloc_permit
+
+ // CHECK-LLVM: call void @llvm.nvvm.tcgen05.relinq.alloc.permit.cg2()
+ nvvm.tcgen05.relinquish_alloc_permit {group = #nvvm.tcgen05_group<cta_2>}
+ llvm.return
+}
More information about the Mlir-commits
mailing list