[llvm] [NVPTX] Add Intrinsics for discard.* (PR #128404)
Abhilash Majumder via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 27 07:39:50 PST 2025
https://github.com/abhilash1910 updated https://github.com/llvm/llvm-project/pull/128404
>From e47430fd596637c39b326e25d7d74579dcc7dbcf Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Sun, 23 Feb 2025 12:29:13 +0530
Subject: [PATCH 1/4] add discard intrinsics
---
llvm/docs/NVPTXUsage.rst | 25 +++++++++++++++++
llvm/include/llvm/IR/IntrinsicsNVVM.td | 9 ++++++
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 11 ++++++++
llvm/test/CodeGen/NVPTX/discard.ll | 35 ++++++++++++++++++++++++
4 files changed, 80 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/discard.ll
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 675b458c41e7b..d905d9b56fe29 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -630,6 +630,31 @@ uses and eviction priority which can be accessed by the '``.level::eviction_prio
For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu>`_.
+'``llvm.nvvm.discard.*``'
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 %size)
+ declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 %size)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.discard.*``' intrinsics invalidate the data at the address range [a .. a + (size - 1)]
+in the cache level specified by the ``.level`` qualifier, without writing the cached data
+back to memory. The ``size`` operand is an integer constant that specifies the amount of data,
+in bytes, to be discarded from the cache level specified by the ``.level`` qualifier. The only
+supported value for the ``size`` operand is 128. If no state space is specified, generic addressing
+is used. If the specified address does not fall within the address window of the ``.global``
+state space, the behavior is undefined.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>`_.
+
'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
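For reference, a minimal LLVM IR sketch of how the new intrinsics are intended to be called
(the function name and pointer arguments are illustrative; per the overview above, 128 is the
only supported size value):

.. code-block:: llvm

   declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1), i64 immarg)
   declare void @llvm.nvvm.discard.L2(ptr, i64 immarg)

   define void @discard_line(ptr addrspace(1) %gp, ptr %p) {
     ; Discard a 128-byte range from L2 for a global-space address.
     call void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %gp, i64 128)
     ; Same, but through a generic-address pointer.
     call void @llvm.nvvm.discard.L2(ptr %p, i64 128)
     ret void
   }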
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index c32bf0318b5d6..00613eb7d2d17 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5043,6 +5043,15 @@ def int_nvvm_prefetch_global_L2_evict_last: Intrinsic<[], [llvm_global_ptr_ty],
def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty],
[IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
+// Intrinsics for discard
+def int_nvvm_discard_global_L2 : DefaultAttrsIntrinsic<[],
+ [llvm_global_ptr_ty, llvm_i64_ty], [IntrArgMemOnly, ReadOnly<ArgIndex<0>>,
+ NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+
+def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
+ [llvm_ptr_ty, llvm_i64_ty], [IntrArgMemOnly, ReadOnly<ArgIndex<0>>,
+ NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+
// Intrinsics for Bulk Copy using TMA (non-tensor)
// From Global to Shared Cluster
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index acb9fc9867b0f..fd93ce312c9db 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -789,6 +789,17 @@ def PREFETCH_GLOBAL_L2_EVICT_LAST : NVPTXInst<(outs), (ins Int64Regs:$addr),
def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
+// Discard Intrinsics
+class DISCARD_L2_INTRS<string Addr> :
+ NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size),
+ StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], $size;",
+ [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_discard", Addr , "L2"]>.ret)
+ i64:$addr, i64:$size)]>,
+ Requires<[hasPTX<74>, hasSM<80>]>;
+
+def DISCARD_L2 : DISCARD_L2_INTRS<"">;
+def DISCARD_GLOBAL_L2 : DISCARD_L2_INTRS<"global">;
+
//-----------------------------------
// MBarrier Functions
//-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/discard.ll b/llvm/test/CodeGen/NVPTX/discard.ll
new file mode 100644
index 0000000000000..80217807765d0
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/discard.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: %if ptxas-11.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74 | %ptxas-verify -arch=sm_80 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 %size)
+declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 %size)
+
+define void @discard_global_L2(ptr addrspace(1) %global_ptr, i64 %size) {
+; CHECK-PTX64-LABEL: discard_global_L2(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [discard_global_L2_param_0];
+; CHECK-PTX64-NEXT: discard.global.L2 [%rd1], 128;
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 128)
+ ret void
+}
+
+define void @discard_L2(ptr %ptr, i64 %size) {
+; CHECK-PTX64-LABEL: discard_L2(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [discard_L2_param_0];
+; CHECK-PTX64-NEXT: discard.L2 [%rd1], 128;
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.discard.L2(ptr %ptr, i64 128)
+ ret void
+}
+
>From 60a6bed3c6119b07fa3628042dcaf00c4a7e330e Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Mon, 24 Feb 2025 08:56:05 +0530
Subject: [PATCH 2/4] refine fix size by 128
---
llvm/docs/NVPTXUsage.rst | 4 ++--
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 6 +++---
llvm/test/CodeGen/NVPTX/discard.ll | 8 ++++----
3 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index d905d9b56fe29..c002d927be0ce 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -638,8 +638,8 @@ Syntax:
.. code-block:: llvm
- declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 %size)
- declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 %size)
+ declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg)
+ declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg)
Overview:
"""""""""
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index fd93ce312c9db..eac57d1fac77e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -791,10 +791,10 @@ def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
// Discard Intrinsics
class DISCARD_L2_INTRS<string Addr> :
- NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size),
- StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], $size;",
+ NVPTXInst<(outs), (ins Int64Regs:$addr),
+ StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], 128;",
[(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_discard", Addr , "L2"]>.ret)
- i64:$addr, i64:$size)]>,
+ i64:$addr, (i64 128))]>,
Requires<[hasPTX<74>, hasSM<80>]>;
def DISCARD_L2 : DISCARD_L2_INTRS<"">;
diff --git a/llvm/test/CodeGen/NVPTX/discard.ll b/llvm/test/CodeGen/NVPTX/discard.ll
index 80217807765d0..7f70232f01052 100644
--- a/llvm/test/CodeGen/NVPTX/discard.ll
+++ b/llvm/test/CodeGen/NVPTX/discard.ll
@@ -4,10 +4,10 @@
target triple = "nvptx64-nvidia-cuda"
-declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 %size)
-declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 %size)
+declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg)
+declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg)
-define void @discard_global_L2(ptr addrspace(1) %global_ptr, i64 %size) {
+define void @discard_global_L2(ptr addrspace(1) %global_ptr) {
; CHECK-PTX64-LABEL: discard_global_L2(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
@@ -20,7 +20,7 @@ define void @discard_global_L2(ptr addrspace(1) %global_ptr, i64 %size) {
ret void
}
-define void @discard_L2(ptr %ptr, i64 %size) {
+define void @discard_L2(ptr %ptr) {
; CHECK-PTX64-LABEL: discard_L2(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
>From e320b70737aa559bd660284bbf94652fca855b2f Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Mon, 24 Feb 2025 13:36:15 +0530
Subject: [PATCH 3/4] refine tests
---
llvm/test/CodeGen/NVPTX/discard.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/discard.ll b/llvm/test/CodeGen/NVPTX/discard.ll
index 7f70232f01052..8e5c9bab97c8d 100644
--- a/llvm/test/CodeGen/NVPTX/discard.ll
+++ b/llvm/test/CodeGen/NVPTX/discard.ll
@@ -4,8 +4,8 @@
target triple = "nvptx64-nvidia-cuda"
-declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg)
-declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg)
+declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg %size)
+declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg %size)
define void @discard_global_L2(ptr addrspace(1) %global_ptr) {
; CHECK-PTX64-LABEL: discard_global_L2(
>From f745ad8aa7c9a5086b1532bf1b07f0e3f7887104 Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Tue, 25 Feb 2025 13:24:04 +0530
Subject: [PATCH 4/4] refine
---
llvm/include/llvm/IR/IntrinsicsNVVM.td | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 00613eb7d2d17..b67c7350188a8 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5046,11 +5046,11 @@ def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty],
// Intrinsics for discard
def int_nvvm_discard_global_L2 : DefaultAttrsIntrinsic<[],
[llvm_global_ptr_ty, llvm_i64_ty], [IntrArgMemOnly, ReadOnly<ArgIndex<0>>,
- NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+ NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
[llvm_ptr_ty, llvm_i64_ty], [IntrArgMemOnly, ReadOnly<ArgIndex<0>>,
- NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+ NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
// Intrinsics for Bulk Copy using TMA (non-tensor)