[llvm] [LLVM][NVPTX] Add NVPTX codegen support for fence.proxy.tensormap (PR #100748)
Pradeep Kumar via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 26 07:01:46 PDT 2024
https://github.com/schwarzschild-radius created https://github.com/llvm/llvm-project/pull/100748
This commit adds LLVM intrinsics and NVPTX codegen support for `fence.proxy.tensormap`, with lit tests in fence-proxy-tensormap.ll. It also adds documentation for the intrinsics to NVPTXUsage.rst.
>From 5fa07dbb719cc35e611d780e3fa8bf203ec187fb Mon Sep 17 00:00:00 2001
From: pradeepku <pradeepku at nvidia.com>
Date: Fri, 26 Jul 2024 07:46:14 +0530
Subject: [PATCH] [LLVM][NVPTX] Add NVPTX codegen support for
fence.proxy.tensormap
This commit adds LLVM Intrinsics and NVPTX codegen support for
`fence.proxy.tensormap` with lit tests under fence-proxy-tensormap.ll.
Also, added Intrinsics documentation in NVPTXUsage.rst
---
llvm/docs/NVPTXUsage.rst | 28 +++++++++++++
llvm/include/llvm/IR/IntrinsicsNVVM.td | 14 +++++++
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 42 +++++++++++++++++++
.../CodeGen/NVPTX/fence-proxy-tensormap.ll | 36 ++++++++++++++++
4 files changed, 120 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 4385cba3ada0d..2b87985c78bbd 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -251,6 +251,34 @@ Overview:
The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
+Membar/Fences
+-------------
+
+
+'``llvm.nvvm.fence.proxy.tensormap.*``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.fence.proxy.tensormap.release.cta()
+ declare void @llvm.nvvm.fence.proxy.tensormap.release.cluster()
+ declare void @llvm.nvvm.fence.proxy.tensormap.release.gpu()
+ declare void @llvm.nvvm.fence.proxy.tensormap.release.sys()
+
+ declare void @llvm.nvvm.fence.proxy.tensormap.acquire.cta(ptr %addr, i32 %size)
+ declare void @llvm.nvvm.fence.proxy.tensormap.acquire.cluster(ptr %addr, i32 %size)
+ declare void @llvm.nvvm.fence.proxy.tensormap.acquire.gpu(ptr %addr, i32 %size)
+ declare void @llvm.nvvm.fence.proxy.tensormap.acquire.sys(ptr %addr, i32 %size)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.fence.proxy.tensormap.release.*``' intrinsics emit ``fence.proxy.tensormap::generic.release.*;`` and the '``@llvm.nvvm.fence.proxy.tensormap.acquire.*``' intrinsics emit ``fence.proxy.tensormap::generic.acquire.* [addr], size;``. ``nvvm.fence.proxy.tensormap.*`` is a uni-directional fence used to establish ordering between memory accesses that may happen through different proxies. ``nvvm.fence.proxy.tensormap.release`` can form a release sequence that synchronizes with an acquire sequence that contains the ``nvvm.fence.proxy.tensormap.acquire`` proxy fence.
+
+The address operand ``addr`` and the operand ``size`` together specify the memory range ``[addr, addr+size-1]`` on which the ordering guarantees on the memory accesses across the proxies are to be provided. The ``size`` operand must be an immediate, and the only supported value is ``128``. For more information, see `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`_.
Other Intrinsics
----------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 1e7fdb53059e2..cf4e7ee1d991e 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1418,6 +1418,20 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_fence_sc_cluster:
Intrinsic<[], [], [IntrNoCallback]>;
+// Proxy fence (uni-directional)
+foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+
+ def int_nvvm_fence_proxy_tensormap_release_ # scope:
+ Intrinsic<[], [], [IntrNoCallback],
+ "llvm.nvvm.fence.proxy.tensormap.release." # scope>;
+
+ def int_nvvm_fence_proxy_tensormap_acquire_ # scope:
+ Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty],
+ [IntrNoCallback, ImmArg<ArgIndex<1>>],
+ "llvm.nvvm.fence.proxy.tensormap.acquire." # scope>;
+
+}
+
// Async Copy
def int_nvvm_cp_async_mbarrier_arrive :
ClangBuiltin<"__nvvm_cp_async_mbarrier_arrive">,
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c81dfa68e4bd4..80ad91137689f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -335,6 +335,48 @@ def INT_FENCE_SC_CLUSTER:
MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
Requires<[hasPTX<78>, hasSM<90>]>;
+// Proxy fence (uni-directional)
+// fence.proxy.tensormap.release variants
+
+class FENCE_PROXY_TENSORMAP_RELEASE<string Scope, Intrinsic Intr> :
+ NVPTXInst<(outs), (ins),
+ "fence.proxy.tensormap::generic.release." # Scope # ";", [(Intr)]>,
+ Requires<[hasPTX<83>, hasSM<90>]>;
+
+def INT_FENCE_PROXY_TENSORMAP_RELEASE_CTA:
+ FENCE_PROXY_TENSORMAP_RELEASE<"cta",
+ int_nvvm_fence_proxy_tensormap_release_cta>;
+def INT_FENCE_PROXY_TENSORMAP_RELEASE_CLUSTER:
+ FENCE_PROXY_TENSORMAP_RELEASE<"cluster",
+ int_nvvm_fence_proxy_tensormap_release_cluster>;
+def INT_FENCE_PROXY_TENSORMAP_RELEASE_GPU:
+ FENCE_PROXY_TENSORMAP_RELEASE<"gpu",
+ int_nvvm_fence_proxy_tensormap_release_gpu>;
+def INT_FENCE_PROXY_TENSORMAP_RELEASE_SYS:
+ FENCE_PROXY_TENSORMAP_RELEASE<"sys",
+ int_nvvm_fence_proxy_tensormap_release_sys>;
+
+// fence.proxy.tensormap.acquire variants
+
+class FENCE_PROXY_TENSORMAP_ACQUIRE<string Scope, Intrinsic Intr> :
+ NVPTXInst<(outs), (ins Int64Regs:$addr, i32imm:$size),
+ "fence.proxy.tensormap::generic.acquire." # Scope # " [$addr], $size;",
+ [(Intr Int64Regs:$addr, timm:$size)]>,
+ Requires<[hasPTX<83>, hasSM<90>]>;
+
+def INT_FENCE_PROXY_TENSORMAP_ACQUIRE_CTA :
+ FENCE_PROXY_TENSORMAP_ACQUIRE<"cta",
+ int_nvvm_fence_proxy_tensormap_acquire_cta>;
+def INT_FENCE_PROXY_TENSORMAP_ACQUIRE_CLUSTER :
+ FENCE_PROXY_TENSORMAP_ACQUIRE<"cluster",
+ int_nvvm_fence_proxy_tensormap_acquire_cluster>;
+def INT_FENCE_PROXY_TENSORMAP_ACQUIRE_GPU :
+ FENCE_PROXY_TENSORMAP_ACQUIRE<"gpu",
+ int_nvvm_fence_proxy_tensormap_acquire_gpu>;
+def INT_FENCE_PROXY_TENSORMAP_ACQUIRE_SYS :
+ FENCE_PROXY_TENSORMAP_ACQUIRE<"sys",
+ int_nvvm_fence_proxy_tensormap_acquire_sys>;
+
//-----------------------------------
// Async Copy Functions
//-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll
new file mode 100644
index 0000000000000..72f82aad840a5
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-12.5 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %}
+
+; CHECK-LABEL: test_fence_proxy_tensormap_release
+define void @test_fence_proxy_tensormap_release() {
+ ; CHECK: fence.proxy.tensormap::generic.release.cta;
+ call void @llvm.nvvm.fence.proxy.tensormap.release.cta();
+
+ ; CHECK: fence.proxy.tensormap::generic.release.cluster;
+ call void @llvm.nvvm.fence.proxy.tensormap.release.cluster();
+
+ ; CHECK: fence.proxy.tensormap::generic.release.gpu;
+ call void @llvm.nvvm.fence.proxy.tensormap.release.gpu();
+
+ ; CHECK: fence.proxy.tensormap::generic.release.sys;
+ call void @llvm.nvvm.fence.proxy.tensormap.release.sys();
+
+ ret void
+}
+
+; CHECK-LABEL: test_fence_proxy_tensormap_acquire
+define void @test_fence_proxy_tensormap_acquire(ptr addrspace(0) %addr) {
+ ; CHECK: fence.proxy.tensormap::generic.acquire.cta [%rd{{[0-9]+}}], 128;
+ call void @llvm.nvvm.fence.proxy.tensormap.acquire.cta(ptr addrspace(0) %addr, i32 128);
+
+ ; CHECK: fence.proxy.tensormap::generic.acquire.cluster [%rd{{[0-9]+}}], 128;
+ call void @llvm.nvvm.fence.proxy.tensormap.acquire.cluster(ptr addrspace(0) %addr, i32 128);
+
+ ; CHECK: fence.proxy.tensormap::generic.acquire.gpu [%rd{{[0-9]+}}], 128;
+ call void @llvm.nvvm.fence.proxy.tensormap.acquire.gpu(ptr addrspace(0) %addr, i32 128);
+
+ ; CHECK: fence.proxy.tensormap::generic.acquire.sys [%rd{{[0-9]+}}], 128;
+ call void @llvm.nvvm.fence.proxy.tensormap.acquire.sys(ptr addrspace(0) %addr, i32 128);
+
+ ret void
+}
More information about the llvm-commits
mailing list