[llvm] [NVPTX] Add Intrinsics for applypriority.* (PR #127989)

Thu Feb 20 03:41:31 PST 2025

https://github.com/abhilash1910 updated https://github.com/llvm/llvm-project/pull/127989

>From 015908adee722ed23dcef0daba88caff0de8519f Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Thu, 20 Feb 2025 16:38:41 +0530
Subject: [PATCH 1/3] applypriority intrinsics

---
 llvm/docs/NVPTXUsage.rst                 | 24 +++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsNVVM.td   | 10 +++++++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 16 +++++++++++
 llvm/test/CodeGen/NVPTX/applypriority.ll | 34 ++++++++++++++++++++++++
 4 files changed, 84 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/applypriority.ll

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 675b458c41e7b..61ae07816bdfd 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -630,6 +630,30 @@ uses and eviction priority which can be accessed by the '``.level::eviction_prio
 For more information, refer to the PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu>`_.
 
+'``llvm.nvvm.applypriority.*``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void  @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 %size)
+  declare void  @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 %size)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.applypriority.*``'  applies the cache eviction priority specified by the
+.level::eviction_priority qualifier to the address range [a..a+size) in the specified cache 
+level. If no state space is specified then Generic Addressing is used. If the specified address 
+does not fall within the address window of .global state space then the behavior is undefined.
+The operand size is an integer constant that specifies the amount of data, in bytes, in the specified cache
+level on which the priority is to be applied. The only supported value for the size operand is 128.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-applypriority>`_.
+
 '``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index c32bf0318b5d6..eff860bc3a850 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5043,6 +5043,16 @@ def int_nvvm_prefetch_global_L2_evict_last: Intrinsic<[], [llvm_global_ptr_ty],
 def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty],
   [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
 
+def int_nvvm_applypriority_global_L2_evict_normal
+  : Intrinsic<[], [llvm_global_ptr_ty, llvm_i64_ty],
+              [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
+               ImmArg<ArgIndex<1>>]>;
+
+def int_nvvm_applypriority_L2_evict_normal
+  : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty],
+              [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
+               ImmArg<ArgIndex<1>>]>;
+
 
 // Intrinsics for Bulk Copy using TMA (non-tensor)
 // From Global to Shared Cluster
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index ed7963f35a7c7..78e0621fb52d9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -789,6 +789,22 @@ def PREFETCH_GLOBAL_L2_EVICT_LAST   : NVPTXInst<(outs), (ins Int64Regs:$addr),
 
 def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
 
+//Applypriority intrinsics
+multiclass APPLYPRIORITY_L2_INTRS<string addr> {
+  defvar InstName = "applypriority."
+                    # !if(!eq(addr, ""), "", addr # ".") 
+                    # "L2::evict_normal";
+                    
+  def APPLYPRIORITY_L2 :  NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size),
+                          InstName # " [$addr], $size;",
+                          [(!cast<Intrinsic>("int_nvvm_" # !subst("::", "_", !subst(".", "_", InstName)))
+                          i64:$addr, i64:$size)]>,
+                          Requires<[hasPTX<80>, hasSM<90>]>;
+}
+
+defm APPLYPRIORITY_L2_EVICT_NORMAL      : APPLYPRIORITY_L2_INTRS<"">;
+defm APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"global">;
+
 //-----------------------------------
 // MBarrier Functions
 //-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/applypriority.ll b/llvm/test/CodeGen/NVPTX/applypriority.ll
new file mode 100644
index 0000000000000..51998f4d850c2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/applypriority.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void  @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 %size)
+declare void  @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 %size)
+
+define void @applypriority_global_L2(ptr addrspace(1) %global_ptr, i64 %size) {
+; CHECK-PTX64-LABEL: applypriority_global_L2(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [applypriority_global_L2_param_0];
+; CHECK-PTX64-NEXT:    applypriority.global.L2::evict_normal [%rd1], 128;
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 128)
+  ret void
+}
+
+define void @applypriority_L2(ptr %ptr, i64 %size) {
+; CHECK-PTX64-LABEL: applypriority_L2(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [applypriority_L2_param_0];
+; CHECK-PTX64-NEXT:    applypriority.L2::evict_normal [%rd1], 128;
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 128)
+  ret void
+}

>From 38a8fc12ee8e547f2184536098ad2a0c4cbe3a4c Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Thu, 20 Feb 2025 17:05:07 +0530
Subject: [PATCH 2/3] remove multiclass and refine

---
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 78e0621fb52d9..bd8077e2dd887 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -790,20 +790,15 @@ def PREFETCH_GLOBAL_L2_EVICT_LAST   : NVPTXInst<(outs), (ins Int64Regs:$addr),
 def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
 
 //Applypriority intrinsics
-multiclass APPLYPRIORITY_L2_INTRS<string addr> {
-  defvar InstName = "applypriority."
-                    # !if(!eq(addr, ""), "", addr # ".") 
-                    # "L2::evict_normal";
-                    
-  def APPLYPRIORITY_L2 :  NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size),
-                          InstName # " [$addr], $size;",
-                          [(!cast<Intrinsic>("int_nvvm_" # !subst("::", "_", !subst(".", "_", InstName)))
-                          i64:$addr, i64:$size)]>,
-                          Requires<[hasPTX<80>, hasSM<90>]>;
-}
-
-defm APPLYPRIORITY_L2_EVICT_NORMAL      : APPLYPRIORITY_L2_INTRS<"">;
-defm APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"global">;
+class APPLYPRIORITY_L2_INTRS<string InstName> :
+          NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size),
+          InstName # " [$addr], $size;",
+          [(!cast<Intrinsic>("int_nvvm_" # !subst("::", "_", !subst(".", "_", InstName)))
+          i64:$addr, i64:$size)]>,
+          Requires<[hasPTX<80>, hasSM<90>]>;
+
+def APPLYPRIORITY_L2_EVICT_NORMAL        : APPLYPRIORITY_L2_INTRS<"applypriority.L2::evict_normal">;
+def APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"applypriority.global.L2::evict_normal">;
 
 //-----------------------------------
 // MBarrier Functions

>From 4ebdc2cfa92ca38ec63810e2ef75b0e29d1b6317 Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Thu, 20 Feb 2025 17:11:04 +0530
Subject: [PATCH 3/3] refine versions

---
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index bd8077e2dd887..1ca7d7e87eb08 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -795,7 +795,7 @@ class APPLYPRIORITY_L2_INTRS<string InstName> :
           InstName # " [$addr], $size;",
           [(!cast<Intrinsic>("int_nvvm_" # !subst("::", "_", !subst(".", "_", InstName)))
           i64:$addr, i64:$size)]>,
-          Requires<[hasPTX<80>, hasSM<90>]>;
+          Requires<[hasPTX<74>, hasSM<80>]>;
 
 def APPLYPRIORITY_L2_EVICT_NORMAL        : APPLYPRIORITY_L2_INTRS<"applypriority.L2::evict_normal">;
 def APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"applypriority.global.L2::evict_normal">;