[Mlir-commits] [mlir] [MLIR][NVVM] Add support for fence.proxy.{acquire, release} Ops (PR #106689)
Pradeep Kumar
llvmlistbot at llvm.org
Fri Aug 30 01:44:08 PDT 2024
https://github.com/schwarzschild-radius created https://github.com/llvm/llvm-project/pull/106689
This commit adds fence.proxy.acquire and fence.proxy.release Ops which map to uni-directional proxy fences in PTX with lowering tests and negative tests under nvvmir.mlir and nvvmir-invalid.mlir respectively. The commit also adds a new MemScopeKind attribute and extends the current ProxyKindAttr to support tensormap and generic.
>From afe887dacfcfb3253f1725aa14d2e3b0228b98d2 Mon Sep 17 00:00:00 2001
From: pradeepku <pradeepku at nvidia.com>
Date: Mon, 26 Aug 2024 18:46:12 +0530
Subject: [PATCH] [MLIR][NVVM] Add support for fence.proxy.{acquire, release}
Ops
This commit adds fence.proxy.acquire and fence.proxy.release Ops which map to uni-directional proxy fences in PTX with lowering tests and negative tests under nvvmir.mlir and nvvmir-invalid.mlir respectively. The commit also adds a new MemScopeKind attribute and extends the current ProxyKindAttr to support tensormap and generic.
---
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 79 ++++++++++++++++++-
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 28 +++++++
.../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 34 ++++++++
mlir/test/Target/LLVMIR/nvvmir-invalid.mlir | 33 ++++++++
mlir/test/Target/LLVMIR/nvvmir.mlir | 37 +++++++++
5 files changed, 210 insertions(+), 1 deletion(-)
create mode 100644 mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 4d48b3de7a57ed..709dd922b8fa2f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -19,6 +19,7 @@ include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td"
+def LLVM_PointerGeneric : LLVM_PointerInAddressSpace<0>;
def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
def LLVM_PointerShared : LLVM_PointerInAddressSpace<3>;
@@ -531,8 +532,10 @@ def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">;
def ProxyAsync : I32EnumAttrCase<"async", 1, "async">;
def ProxyAsyncGlobal : I32EnumAttrCase<"async_global", 2, "async.global">;
def ProxyAsyncShared : I32EnumAttrCase<"async_shared", 3, "async.shared">;
+def ProxyTensorMap : I32EnumAttrCase<"TENSORMAP", 4, "tensormap">;
+def ProxyGeneric : I32EnumAttrCase<"GENERIC", 5, "generic">;
def ProxyKind : I32EnumAttr<"ProxyKind", "Proxy kind",
- [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared]> {
+ [ProxyAlias, ProxyAsync, ProxyAsyncGlobal, ProxyAsyncShared, ProxyTensorMap, ProxyGeneric]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::NVVM";
}
@@ -565,6 +568,80 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">,
let hasVerifier = 1;
}
+// Attrs describing the scope of the Memory Operation
+def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">;
+def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">;
+def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">;
+def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">;
+
+def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind",
+ [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::NVVM";
+}
+def MemScopeKindAttr : EnumAttr<NVVM_Dialect, MemScopeKind, "mem_scope"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">,
+ Arguments<(ins MemScopeKindAttr:$scope, LLVM_PointerGeneric:$addr, I32:$size,
+ DefaultValuedAttr<ProxyKindAttr,
+ "ProxyKind::GENERIC">:$fromProxy,
+ DefaultValuedAttr<ProxyKindAttr,
+ "ProxyKind::TENSORMAP">:$toProxy)> {
+ let summary = "Uni-directional proxy fence operation with acquire semantics";
+ let description = [{
+ `fence.proxy.acquire` is a uni-directional fence used to establish ordering
+ between a prior memory access performed via the generic proxy and a
+ subsequent memory access performed via the tensormap proxy
+
+ The address operand `addr` and the operand `size` together specify the
+ memory range `[addr, addr+size)` on which the ordering guarantees on the
+ memory accesses across the proxies is to be provided. The only supported
+ value for the `size` operand is 128 and must be an immediate. Generic Addressing
+ is used unconditionally, and the address specified by the operand `addr` must
+ fall within the `.global` state space. Otherwise, the behavior is undefined
+ [For more information, see PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+ }];
+
+ let assemblyFormat = "$scope $addr `,` $size (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
+ let llvmBuilder = [{
+ createIntrinsicCall(
+ builder,
+ getUnidirectionalFenceProxyID($fromProxy, $toProxy, $scope, false),
+ {$addr, $size});
+ }];
+
+ let hasVerifier = 1;
+}
+
+def NVVM_FenceProxyReleaseOp : NVVM_Op<"fence.proxy.release">,
+ Arguments<(ins MemScopeKindAttr:$scope,
+ DefaultValuedAttr<ProxyKindAttr,
+ "ProxyKind::GENERIC">:$fromProxy,
+ DefaultValuedAttr<ProxyKindAttr,
+ "ProxyKind::TENSORMAP">:$toProxy)> {
+ let summary = "Uni-directional proxy fence operation with release semantics";
+ let description = [{
+ `fence.proxy.release` is a uni-directional fence used to establish ordering
+ between a prior memory access performed via the generic proxy and a
+ subsequent memory access performed via the tensormap proxy. `fence.proxy.release`
+ operation can form a release sequence that synchronizes with an acquire
+ sequence that contains the fence.proxy.acquire proxy fence operation
+ [For more information, see PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar)
+ }];
+
+ let assemblyFormat = "$scope (`from_proxy` `=` $fromProxy^)? (`to_proxy` `=` $toProxy^)? attr-dict";
+ let llvmBuilder = [{
+ createIntrinsicCall(builder, getUnidirectionalFenceProxyID(
+ $fromProxy, $toProxy, $scope, true));
+ }];
+
+ let hasVerifier = 1;
+}
+
def SetMaxRegisterActionIncrease : I32EnumAttrCase<"increase", 0>;
def SetMaxRegisterActionDecrease : I32EnumAttrCase<"decrease", 1>;
def SetMaxRegisterAction : I32EnumAttr<"SetMaxRegisterAction", "NVVM set max register action",
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 4d1896551101ed..2c7c3e9d535f7d 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1004,6 +1004,10 @@ void NVVM::WgmmaMmaAsyncOp::getAsmValues(
}
}
LogicalResult NVVM::FenceProxyOp::verify() {
+ if (getKind() == NVVM::ProxyKind::TENSORMAP)
+ return emitOpError() << "tensormap proxy is not a supported proxy kind";
+ if (getKind() == NVVM::ProxyKind::GENERIC)
+ return emitOpError() << "generic proxy not a supported proxy kind";
if (getKind() == NVVM::ProxyKind::async_shared && !getSpace().has_value()) {
return emitOpError() << "async_shared fence requires space attribute";
}
@@ -1013,6 +1017,30 @@ LogicalResult NVVM::FenceProxyOp::verify() {
return success();
}
+LogicalResult NVVM::FenceProxyAcquireOp::verify() {
+ if (getFromProxy() != NVVM::ProxyKind::GENERIC)
+ return emitOpError("uni-directional proxies only support generic for "
+ "from_proxy attribute");
+
+ if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
+ return emitOpError("uni-directional proxies only support tensormap "
+ "for to_proxy attribute");
+
+ return success();
+}
+
+LogicalResult NVVM::FenceProxyReleaseOp::verify() {
+ if (getFromProxy() != NVVM::ProxyKind::GENERIC)
+ return emitOpError("uni-directional proxies only support generic for "
+ "from_proxy attribute");
+
+ if (getToProxy() != NVVM::ProxyKind::TENSORMAP)
+ return emitOpError("uni-directional proxies only support tensormap "
+ "for to_proxy attribute");
+
+ return success();
+}
+
LogicalResult NVVM::SetMaxRegisterOp::verify() {
if (getRegCount() % 8)
return emitOpError("new register size must be multiple of 8");
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index a09c24dda82afc..f93e1cc8780c79 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -120,6 +120,40 @@ static llvm::Intrinsic::ID getLdMatrixIntrinsicId(NVVM::MMALayout layout,
}
}
+/// Returns the ID of the NVVM intrinsic implementing a uni-directional proxy
+/// fence (release flavour if `isRelease`, acquire otherwise) at `scope`. Only
+/// the generic -> tensormap proxy pair is handled; anything else is a bug.
+static unsigned getUnidirectionalFenceProxyID(NVVM::ProxyKind fromProxy,
+                                              NVVM::ProxyKind toProxy,
+                                              NVVM::MemScopeKind scope,
+                                              bool isRelease) {
+  if (fromProxy == NVVM::ProxyKind::GENERIC &&
+      toProxy == NVVM::ProxyKind::TENSORMAP) {
+    switch (scope) {
+    case NVVM::MemScopeKind::CTA:
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_cta;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cta;
+    case NVVM::MemScopeKind::CLUSTER:
+      if (isRelease)
+        return llvm::Intrinsic::
+            nvvm_fence_proxy_tensormap_generic_release_cluster;
+      return llvm::Intrinsic::
+          nvvm_fence_proxy_tensormap_generic_acquire_cluster;
+    case NVVM::MemScopeKind::GPU:
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_gpu;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_gpu;
+    case NVVM::MemScopeKind::SYS:
+      if (isRelease)
+        return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_release_sys;
+      return llvm::Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_sys;
+    }
+    llvm_unreachable("Unknown scope for uni-directional fence.proxy operation");
+  }
+  llvm_unreachable("Unsupported proxy kinds");
+}
+
namespace {
/// Implementation of the dialect interface that converts operations belonging
/// to the NVVM dialect to LLVM IR.
diff --git a/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
new file mode 100644
index 00000000000000..0e563808da970b
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvmir-invalid.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+ // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support generic for from_proxy attribute}}
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+ // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support generic for from_proxy attribute}}
+ nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<tensormap> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_acquire(%addr : !llvm.ptr, %size : i32) {
+ // expected-error @below {{'nvvm.fence.proxy.acquire' op uni-directional proxies only support tensormap for to_proxy attribute}}
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %size from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
+
+// -----
+
+llvm.func @nvvm_fence_proxy_release() {
+ // expected-error @below {{'nvvm.fence.proxy.release' op uni-directional proxies only support tensormap for to_proxy attribute}}
+ nvvm.fence.proxy.release #nvvm.mem_scope<cta> from_proxy=#nvvm.proxy_kind<generic> to_proxy=#nvvm.proxy_kind<generic>
+ llvm.return
+}
\ No newline at end of file
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir
index a8ae4d97888c90..6e2787d121ae64 100644
--- a/mlir/test/Target/LLVMIR/nvvmir.mlir
+++ b/mlir/test/Target/LLVMIR/nvvmir.mlir
@@ -574,3 +574,40 @@ llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant})
llvm.func @kernel_func(%arg0: !llvm.ptr {llvm.byval = i32, nvvm.grid_constant}, %arg1: f32, %arg2: !llvm.ptr {llvm.byval = f32, nvvm.grid_constant}) attributes {nvvm.kernel} {
llvm.return
}
+
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_release
+llvm.func @nvvm_fence_proxy_tensormap_generic_release() {
+  // fence.proxy.release has no SSA operands, only attributes (scope here).
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cta>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
+  nvvm.fence.proxy.release #nvvm.mem_scope<cluster>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
+  nvvm.fence.proxy.release #nvvm.mem_scope<gpu>
+
+  // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
+  nvvm.fence.proxy.release #nvvm.mem_scope<sys>
+  llvm.return
+}
+
+// -----
+// CHECK-LABEL: @nvvm_fence_proxy_tensormap_generic_acquire
+llvm.func @nvvm_fence_proxy_tensormap_generic_acquire(%addr : !llvm.ptr) {
+ %c128 = llvm.mlir.constant(128) : i32
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cta> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<cluster> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<gpu> %addr, %c128
+
+ // CHECK: call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr {{%[0-9]+}}, i32 128)
+ nvvm.fence.proxy.acquire #nvvm.mem_scope<sys> %addr, %c128
+ llvm.return
+}
\ No newline at end of file
More information about the Mlir-commits
mailing list