[llvm] [LLVM][NVPTX] Add support for tensormap.cp_fenceproxy (PR #107555)
Pradeep Kumar via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 6 03:01:48 PDT 2024
https://github.com/schwarzschild-radius created https://github.com/llvm/llvm-project/pull/107555
This commit adds NVPTX codegen support for the tensormap.cp_fenceproxy instruction, with lit tests under tensormap-cp-fence-proxy.ll. It also adds documentation for the intrinsics in NVPTXUsage.rst.
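For illustration, a minimal IR snippet (modeled on the lit test in this patch; the function name and arguments are placeholders) that exercises the cta-scoped variant. The other scopes (cluster, gpu, sys) are used identically:

    define void @copy_and_fence(ptr addrspace(1) %gmem, ptr addrspace(3) %smem) {
      ; Copy 128 bytes of tensormap data from shared::cta to global memory,
      ; then establish a cta-scoped release ordering in the tensormap proxy.
      ; The size operand must be the immediate 128.
      call void @llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.cta.sync.aligned(ptr addrspace(1) %gmem, ptr addrspace(3) %smem, i32 128)
      ret void
    }

    declare void @llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.cta.sync.aligned(ptr addrspace(1), ptr addrspace(3), i32)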
From 20e6806547d441854b8f008ba638183a856f67e7 Mon Sep 17 00:00:00 2001
From: pradeepku <pradeepku at nvidia.com>
Date: Tue, 3 Sep 2024 19:29:15 +0530
Subject: [PATCH] [LLVM][NVPTX] Add support for tensormap.cp_fenceproxy
This commit adds NVPTX codegen support for the tensormap.cp_fenceproxy instruction, with lit tests under tensormap-cp-fence-proxy.ll. It also adds documentation for the intrinsics in NVPTXUsage.rst.
---
llvm/docs/NVPTXUsage.rst | 30 +++++++++++++++++++
llvm/include/llvm/IR/IntrinsicsNVVM.td | 8 +++++
llvm/lib/IR/Verifier.cpp | 12 ++++++++
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 25 ++++++++++++++++
.../CodeGen/NVPTX/tensormap-cp-fence-proxy.ll | 19 ++++++++++++
5 files changed, 94 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/tensormap-cp-fence-proxy.ll
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 3a566bbac36233..62f74dabdec20c 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -313,6 +313,36 @@ The ``@llvm.nvvm.fence.proxy.tensormap_generic.*`` is a uni-directional fence us
The address operand ``addr`` and the operand ``size`` together specify the memory range ``[addr, addr+size)`` on which the ordering guarantees on the memory accesses across the proxies is to be provided. The only supported value for the ``size`` operand is ``128`` and must be an immediate. Generic Addressing is used unconditionally, and the address specified by the operand addr must fall within the ``.global`` state space. Otherwise, the behavior is undefined. For more information, see `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`_.
+'``llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.*.sync.aligned``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.cta.sync.aligned(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size)
+ declare void @llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.cluster.sync.aligned(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size)
+ declare void @llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.gpu.sync.aligned(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size)
+ declare void @llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.sys.sync.aligned(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size)
+
+Overview:
+"""""""""
+
+The ``@llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.*.sync.aligned`` intrinsic is a fused copy and fence operation. It performs the following operations in order:
+
+1. Copies data of the size specified by the ``size`` argument, in bytes, from the location specified by the address operand ``src`` in shared memory to the location specified by the address operand ``dst`` in global memory, using the generic proxy.
+
+2. Establishes a uni-directional proxy ``release`` pattern that orders the copy operation before subsequent memory accesses performed through the ``tensormap`` proxy on the address ``dst``.
+
+The only valid value for the ``size`` operand is ``128``, and it must be an immediate.
+
+The operands ``src`` and ``dst`` specify non-generic addresses in the ``shared::cta`` and ``global`` state spaces, respectively.
+
+The mandatory ``.sync`` qualifier indicates that ``tensormap.cp_fenceproxy`` causes the executing thread to wait until all threads in the warp execute the same ``tensormap.cp_fenceproxy`` intrinsic before resuming execution.
+
+The mandatory ``.aligned`` qualifier indicates that all threads in the warp must execute the same ``tensormap.cp_fenceproxy`` intrinsic. In conditionally executed code, an aligned ``tensormap.cp_fenceproxy`` intrinsic should only be used if it is known that all threads in the warp evaluate the condition identically; otherwise, the behavior is undefined. For more information, see `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy>`_.
+
Arithmetic Intrinsics
---------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 39685c920d948d..a0ee3aa2286159 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4889,6 +4889,14 @@ def int_nvvm_setmaxnreg_dec_sync_aligned_u32
[IntrConvergent, IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>],
"llvm.nvvm.setmaxnreg.dec.sync.aligned.u32">;
+// tensormap.cp_fenceproxy
+foreach scope = ["cta", "cluster", "gpu", "sys"] in {
+ def int_nvvm_tensormap_cp_fenceproxy_global_shared_tensormap_generic_release_ # scope # _sync_aligned:
+ Intrinsic<[], [llvm_global_ptr_ty, llvm_shared_ptr_ty, llvm_i32_ty],
+ [IntrConvergent, IntrNoCallback, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>],
+ "llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release." # scope # ".sync.aligned">;
+}
+
// Exit
def int_nvvm_exit : ClangBuiltin<"__nvvm_exit">,
Intrinsic<[], [], [IntrConvergent, IntrInaccessibleMemOnly, IntrNoReturn]>;
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index d8f3bab45b2a65..00b4d989a03b56 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -6335,6 +6335,18 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
Check(size == 128, " The only supported value for size operand is 128");
break;
}
+ case Intrinsic::
+ nvvm_tensormap_cp_fenceproxy_global_shared_tensormap_generic_release_cta_sync_aligned:
+ case Intrinsic::
+ nvvm_tensormap_cp_fenceproxy_global_shared_tensormap_generic_release_cluster_sync_aligned:
+ case Intrinsic::
+ nvvm_tensormap_cp_fenceproxy_global_shared_tensormap_generic_release_gpu_sync_aligned:
+ case Intrinsic::
+ nvvm_tensormap_cp_fenceproxy_global_shared_tensormap_generic_release_sys_sync_aligned: {
+ unsigned size = cast<ConstantInt>(Call.getArgOperand(2))->getZExtValue();
+ Check(size == 128, " The only supported value for size operand is 128");
+ break;
+ }
};
// Verify that there aren't any unmediated control transfers between funclets.
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 0c883093dd0a54..2950a2cc09db1e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -7001,4 +7001,29 @@ defm INT_SET_MAXNREG_DEC : SET_MAXNREG<"dec", int_nvvm_setmaxnreg_dec_sync_align
} // isConvergent
+class TENSORMAP_CP_FENCE_PROXY_GLOBAL_SHARED_TENSORMAP_GENERIC_RELEASE<string Scope, Intrinsic Intr> :
+ NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$size),
+ "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release." # Scope # ".sync.aligned [$dst], [$src], $size;", [(Intr Int64Regs:$dst, Int64Regs:$src, timm:$size)]>,
+ Requires<[hasPTX<83>, hasSM<90>]>;
+
+let isConvergent = true in {
+
+def INT_PTX_TENSORMAP_CP_FENCEPROXY_GLOBAL_SHARED_TENSORMAP_RELEASE_CTA_SYNC_ALIGNED:
+ TENSORMAP_CP_FENCE_PROXY_GLOBAL_SHARED_TENSORMAP_GENERIC_RELEASE<"cta",
+ int_nvvm_tensormap_cp_fenceproxy_global_shared_tensormap_generic_release_cta_sync_aligned>;
+
+def INT_PTX_TENSORMAP_CP_FENCEPROXY_GLOBAL_SHARED_TENSORMAP_RELEASE_CLUSTER_SYNC_ALIGNED:
+ TENSORMAP_CP_FENCE_PROXY_GLOBAL_SHARED_TENSORMAP_GENERIC_RELEASE<"cluster",
+ int_nvvm_tensormap_cp_fenceproxy_global_shared_tensormap_generic_release_cluster_sync_aligned>;
+
+def INT_PTX_TENSORMAP_CP_FENCEPROXY_GLOBAL_SHARED_TENSORMAP_RELEASE_GPU_SYNC_ALIGNED:
+ TENSORMAP_CP_FENCE_PROXY_GLOBAL_SHARED_TENSORMAP_GENERIC_RELEASE<"gpu",
+ int_nvvm_tensormap_cp_fenceproxy_global_shared_tensormap_generic_release_gpu_sync_aligned>;
+
+def INT_PTX_TENSORMAP_CP_FENCEPROXY_GLOBAL_SHARED_TENSORMAP_RELEASE_SYS_SYNC_ALIGNED:
+ TENSORMAP_CP_FENCE_PROXY_GLOBAL_SHARED_TENSORMAP_GENERIC_RELEASE<"sys",
+ int_nvvm_tensormap_cp_fenceproxy_global_shared_tensormap_generic_release_sys_sync_aligned>;
+
+}
+
def INT_EXIT : NVPTXInst<(outs), (ins), "exit;", [(int_nvvm_exit)]>;
diff --git a/llvm/test/CodeGen/NVPTX/tensormap-cp-fence-proxy.ll b/llvm/test/CodeGen/NVPTX/tensormap-cp-fence-proxy.ll
new file mode 100644
index 00000000000000..13098fffef3df1
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tensormap-cp-fence-proxy.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck --check-prefixes=CHECK %s
+
+; CHECK-LABEL: test_tensormap_cp_fenceproxy
+define void @test_tensormap_cp_fenceproxy(ptr addrspace(1) %gptr, ptr addrspace(3) %sptr) {
+
+ ; CHECK: tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [{{%rd[0-9]+}}], [{{%rd[0-9]+}}], 128;
+ call void @llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.cta.sync.aligned(ptr addrspace(1) %gptr, ptr addrspace(3) %sptr, i32 128)
+
+ ; CHECK: tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [{{%rd[0-9]+}}], [{{%rd[0-9]+}}], 128;
+ call void @llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.cluster.sync.aligned(ptr addrspace(1) %gptr, ptr addrspace(3) %sptr, i32 128)
+
+ ; CHECK: tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [{{%rd[0-9]+}}], [{{%rd[0-9]+}}], 128;
+ call void @llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.gpu.sync.aligned(ptr addrspace(1) %gptr, ptr addrspace(3) %sptr, i32 128)
+
+ ; CHECK: tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [{{%rd[0-9]+}}], [{{%rd[0-9]+}}], 128;
+ call void @llvm.nvvm.tensormap.cp_fenceproxy.global.shared.tensormap_generic.release.sys.sync.aligned(ptr addrspace(1) %gptr, ptr addrspace(3) %sptr, i32 128)
+
+ ret void
+}
\ No newline at end of file
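For reference, each variant lowers to a PTX instruction of the following shape (as checked by the lit test above; register numbers are illustrative) and requires PTX ISA 8.3 and sm_90 or later:

    tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%rd1], [%rd2], 128;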