[llvm] [NVPTX] Add Intrinsics for discard.* (PR #128404)

Abhilash Majumder via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 28 08:33:03 PST 2025


https://github.com/abhilash1910 updated https://github.com/llvm/llvm-project/pull/128404

>From e47430fd596637c39b326e25d7d74579dcc7dbcf Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Sun, 23 Feb 2025 12:29:13 +0530
Subject: [PATCH 01/10] add discard intrinsics

---
 llvm/docs/NVPTXUsage.rst                 | 25 +++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsNVVM.td   |  9 ++++++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 11 ++++++++
 llvm/test/CodeGen/NVPTX/discard.ll       | 35 ++++++++++++++++++++++++
 4 files changed, 80 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/discard.ll

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 675b458c41e7b..d905d9b56fe29 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -630,6 +630,31 @@ uses and eviction priority which can be accessed by the '``.level::eviction_prio
 For more information, refer to the PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu>`_.
 
+``llvm.nvvm.discard.*``'
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void  @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 %size)
+  declare void  @llvm.nvvm.discard.L2(ptr %ptr, i64 %size)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.discard.*``'  invalidates the data at the address range [a .. a + (size - 1)] 
+in the cache level specified by the .level qualifier without writing back the data 
+in the cache to the memory. The operand size is an integer constant that specifies the amount of data, 
+in bytes, in the cache level specified by the .level qualifier to be discarded. The only supported value 
+for the size operand is 128. If no state space is specified then Generic Addressing is used. 
+If the specified address does not fall within the address window of .global state space then 
+the behavior is undefined.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>`_.
+
 '``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index c32bf0318b5d6..00613eb7d2d17 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5043,6 +5043,15 @@ def int_nvvm_prefetch_global_L2_evict_last: Intrinsic<[], [llvm_global_ptr_ty],
 def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty],
   [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
 
+// Intrinsics for discard
+def int_nvvm_discard_global_L2 : DefaultAttrsIntrinsic<[], 
+    [llvm_global_ptr_ty, llvm_i64_ty], [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, 
+    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+
+def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[], 
+    [llvm_ptr_ty, llvm_i64_ty], [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, 
+    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+
 
 // Intrinsics for Bulk Copy using TMA (non-tensor)
 // From Global to Shared Cluster
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index acb9fc9867b0f..fd93ce312c9db 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -789,6 +789,17 @@ def PREFETCH_GLOBAL_L2_EVICT_LAST   : NVPTXInst<(outs), (ins Int64Regs:$addr),
 
 def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
 
+//Discard Intrinsics
+class DISCARD_L2_INTRS<string Addr> :
+          NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size),
+          StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], $size;",
+          [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_discard", Addr , "L2"]>.ret)
+          i64:$addr, i64:$size)]>,
+          Requires<[hasPTX<74>, hasSM<80>]>;
+
+def DISCARD_L2        : DISCARD_L2_INTRS<"">;
+def DISCARD_GLOBAL_L2 : DISCARD_L2_INTRS<"global">;
+
 //-----------------------------------
 // MBarrier Functions
 //-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/discard.ll b/llvm/test/CodeGen/NVPTX/discard.ll
new file mode 100644
index 0000000000000..80217807765d0
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/discard.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: %if ptxas-11.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void  @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 %size)
+declare void  @llvm.nvvm.discard.L2(ptr %ptr, i64 %size)
+
+define void @discard_global_L2(ptr addrspace(1) %global_ptr, i64 %size) {
+; CHECK-PTX64-LABEL: discard_global_L2(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [discard_global_L2_param_0];
+; CHECK-PTX64-NEXT:    discard.global.L2 [%rd1], 128;
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 128)
+  ret void
+}
+
+define void @discard_L2(ptr %ptr, i64 %size) {
+; CHECK-PTX64-LABEL: discard_L2(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [discard_L2_param_0];
+; CHECK-PTX64-NEXT:    discard.L2 [%rd1], 128;
+; CHECK-PTX64-NEXT:    ret;
+  tail call void @llvm.nvvm.discard.L2(ptr %ptr, i64 128)
+  ret void
+}
+

>From 60a6bed3c6119b07fa3628042dcaf00c4a7e330e Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Mon, 24 Feb 2025 08:56:05 +0530
Subject: [PATCH 02/10] refine fix size by 128

---
 llvm/docs/NVPTXUsage.rst                 | 4 ++--
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 6 +++---
 llvm/test/CodeGen/NVPTX/discard.ll       | 8 ++++----
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index d905d9b56fe29..c002d927be0ce 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -638,8 +638,8 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void  @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 %size)
-  declare void  @llvm.nvvm.discard.L2(ptr %ptr, i64 %size)
+  declare void  @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg)
+  declare void  @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg)
 
 Overview:
 """""""""
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index fd93ce312c9db..eac57d1fac77e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -791,10 +791,10 @@ def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
 
 //Discard Intrinsics
 class DISCARD_L2_INTRS<string Addr> :
-          NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size),
-          StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], $size;",
+          NVPTXInst<(outs), (ins Int64Regs:$addr),
+          StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], 128;",
           [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_discard", Addr , "L2"]>.ret)
-          i64:$addr, i64:$size)]>,
+          i64:$addr, (i64 128))]>,
           Requires<[hasPTX<74>, hasSM<80>]>;
 
 def DISCARD_L2        : DISCARD_L2_INTRS<"">;
diff --git a/llvm/test/CodeGen/NVPTX/discard.ll b/llvm/test/CodeGen/NVPTX/discard.ll
index 80217807765d0..7f70232f01052 100644
--- a/llvm/test/CodeGen/NVPTX/discard.ll
+++ b/llvm/test/CodeGen/NVPTX/discard.ll
@@ -4,10 +4,10 @@
 
 target triple = "nvptx64-nvidia-cuda"
 
-declare void  @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 %size)
-declare void  @llvm.nvvm.discard.L2(ptr %ptr, i64 %size)
+declare void  @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg)
+declare void  @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg)
 
-define void @discard_global_L2(ptr addrspace(1) %global_ptr, i64 %size) {
+define void @discard_global_L2(ptr addrspace(1) %global_ptr) {
 ; CHECK-PTX64-LABEL: discard_global_L2(
 ; CHECK-PTX64:       {
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
@@ -20,7 +20,7 @@ define void @discard_global_L2(ptr addrspace(1) %global_ptr, i64 %size) {
   ret void
 }
 
-define void @discard_L2(ptr %ptr, i64 %size) {
+define void @discard_L2(ptr %ptr) {
 ; CHECK-PTX64-LABEL: discard_L2(
 ; CHECK-PTX64:       {
 ; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;

>From e320b70737aa559bd660284bbf94652fca855b2f Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Mon, 24 Feb 2025 13:36:15 +0530
Subject: [PATCH 03/10] refine tests

---
 llvm/test/CodeGen/NVPTX/discard.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/discard.ll b/llvm/test/CodeGen/NVPTX/discard.ll
index 7f70232f01052..8e5c9bab97c8d 100644
--- a/llvm/test/CodeGen/NVPTX/discard.ll
+++ b/llvm/test/CodeGen/NVPTX/discard.ll
@@ -4,8 +4,8 @@
 
 target triple = "nvptx64-nvidia-cuda"
 
-declare void  @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg)
-declare void  @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg)
+declare void  @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg %size)
+declare void  @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg %size)
 
 define void @discard_global_L2(ptr addrspace(1) %global_ptr) {
 ; CHECK-PTX64-LABEL: discard_global_L2(

>From f745ad8aa7c9a5086b1532bf1b07f0e3f7887104 Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Tue, 25 Feb 2025 13:24:04 +0530
Subject: [PATCH 04/10] refine

---
 llvm/include/llvm/IR/IntrinsicsNVVM.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 00613eb7d2d17..b67c7350188a8 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5046,11 +5046,11 @@ def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty],
 // Intrinsics for discard
 def int_nvvm_discard_global_L2 : DefaultAttrsIntrinsic<[], 
     [llvm_global_ptr_ty, llvm_i64_ty], [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, 
-    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
 
 def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[], 
     [llvm_ptr_ty, llvm_i64_ty], [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, 
-    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;
+    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
 
 
 // Intrinsics for Bulk Copy using TMA (non-tensor)

>From b96a256001f3076f81021d3635c472933df23e4f Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Thu, 27 Feb 2025 22:07:11 +0530
Subject: [PATCH 05/10] refine docs

---
 llvm/docs/NVPTXUsage.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index bf1e7bd1a6bf0..0698bca3940ff 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -681,6 +681,7 @@ Syntax:
 
   declare void  @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg)
   declare void  @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg)
+
 Overview:
 """""""""
 

>From 8b8c1357e0810e23c9cb4df86b1be5b7438a1431 Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Fri, 28 Feb 2025 12:46:51 +0530
Subject: [PATCH 06/10] refine nvvm

---
 llvm/include/llvm/IR/IntrinsicsNVVM.td | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index b1999715abfa5..b624f9005bd72 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5084,12 +5084,12 @@ def int_nvvm_applypriority_L2_evict_normal
 
 // Intrinsics for discard
 def int_nvvm_discard_global_L2 : DefaultAttrsIntrinsic<[], 
-    [llvm_global_ptr_ty, llvm_i64_ty], [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, 
-    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
+    [llvm_global_ptr_ty, llvm_i64_ty], [NoCapture<ArgIndex<0>>,
+    ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
 
 def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[], 
-    [llvm_ptr_ty, llvm_i64_ty], [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, 
-    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
+    [llvm_ptr_ty, llvm_i64_ty], [NoCapture<ArgIndex<0>>,
+    ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
 
 // Intrinsics for Bulk Copy using TMA (non-tensor)
 // From Global to Shared Cluster

>From 68540fd37a6ee249a2ce440f4ccf8efb9d8f57cb Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Fri, 28 Feb 2025 15:36:49 +0530
Subject: [PATCH 07/10] refine docs

---
 llvm/docs/NVPTXUsage.rst | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 0698bca3940ff..9de1937c3cbc7 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -685,14 +685,22 @@ Syntax:
 Overview:
 """""""""
 
-The '``@llvm.nvvm.discard.*``'  invalidates the data at the address range [a .. a + (size - 1)]
-abhilash1910 marked this conversation as resolved.
-in the cache level specified by the .level qualifier without writing back the data
-in the cache to the memory. The operand size is an integer constant that specifies the amount of data,
-in bytes, in the cache level specified by the .level qualifier to be discarded. The only supported value
-for the size operand is 128. If no state space is specified then Generic Addressing is used.
-If the specified address does not fall within the address window of .global state space then
-the behavior is undefined.
+The '``@llvm.nvvm.discard.*``' semantically behaves like a weak write of an *unstable indeterminate value*: 
+reads of memory locations with *unstable indeterminate values* may return different 
+bit patterns each time until the memory is overwritten.
+This operation *hints* to the implementation that data in the specified cache ``.level`` 
+can be destructively discarded without writing it back to memory. The operand ``size`` is an 
+integer constant that specifies the length in bytes of the address range ``[a, a + size)`` to write 
+*unstable indeterminate values* into. The only supported value for the ``size`` operand is ``128``. 
+If no state space is specified then `generic-addressing` is used. If the specified address does 
+not fall within the address window of ``.global`` state space then the behavior is undefined.
+
+.. code-block:: text
+  
+  discard.global.L2 [ptr], 128;
+  ld.weak.u32 r0, [ptr];
+  ld.weak.u32 r1, [ptr];
+  // The values in r0 and r1 may differ!
 
 For more information, refer to the PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>`_.

>From 40bf9da193ca81e533da0b977ed1c68907b4aa00 Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Fri, 28 Feb 2025 16:13:29 +0530
Subject: [PATCH 08/10] refine docs

---
 llvm/docs/NVPTXUsage.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 9de1937c3cbc7..96806ae36cf33 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -695,6 +695,9 @@ integer constant that specifies the length in bytes of the address range ``[a, a
 If no state space is specified then `generic-addressing` is used. If the specified address does 
 not fall within the address window of ``.global`` state space then the behavior is undefined.
 
+LLVM does not define anywhere what an *unstable indeterminate values* is, and the closest concept 
+LLVM has breaks the example below:
+
 .. code-block:: text
   
   discard.global.L2 [ptr], 128;

>From 76128cee3b58e50ebeecda1ffd253dad980781f1 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910 at users.noreply.github.com>
Date: Fri, 28 Feb 2025 21:58:01 +0530
Subject: [PATCH 09/10] Merge from suggestion llvm/docs/NVPTXUsage.rst

Co-authored-by: gonzalobg <65027571+gonzalobg at users.noreply.github.com>
---
 llvm/docs/NVPTXUsage.rst | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 96806ae36cf33..12ff3d219a0da 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -685,25 +685,24 @@ Syntax:
 Overview:
 """""""""
 
-The '``@llvm.nvvm.discard.*``' semantically behaves like a weak write of an *unstable indeterminate value*: 
-reads of memory locations with *unstable indeterminate values* may return different 
-bit patterns each time until the memory is overwritten.
-This operation *hints* to the implementation that data in the specified cache ``.level`` 
-can be destructively discarded without writing it back to memory. The operand ``size`` is an 
-integer constant that specifies the length in bytes of the address range ``[a, a + size)`` to write 
-*unstable indeterminate values* into. The only supported value for the ``size`` operand is ``128``. 
-If no state space is specified then `generic-addressing` is used. If the specified address does 
-not fall within the address window of ``.global`` state space then the behavior is undefined.
-
-LLVM does not define anywhere what an *unstable indeterminate values* is, and the closest concept 
-LLVM has breaks the example below:
+The *effects* of the ``@llvm.nvvm.discard.L2*`` intrinsics are those of a non-atomic non-volatile ``llvm.memset`` that writes ``undef`` to the destination address range ``[%ptr, %ptr + immarg)``. 
+Subsequent reads from the address range may read ``undef`` until the memory is overwritten with a different value.
+These operations *hint* the implementation that data in the L2 cache can be destructively discarded without writing it back to memory. 
+The operand ``immarg`` is an integer constant that specifies the length in bytes of the address range ``[%ptr, %ptr + immarg)`` to write ``undef`` into. 
+The only supported value for the ``immarg`` operand is ``128``. 
+If generic addressing is used and the specified address does not fall within the address window of global memory (``addrspace(1)``) the behavior is undefined.
 
-.. code-block:: text
-  
-  discard.global.L2 [ptr], 128;
-  ld.weak.u32 r0, [ptr];
-  ld.weak.u32 r1, [ptr];
-  // The values in r0 and r1 may differ!
+.. code-block:: llvm
+ 
+   call void @llvm.nvvm.discard.L2(ptr %p, i64 128)  ;; writes `undef` to [p, p+128)
+   %a = load i64, ptr %p. ;; loads 8 bytes containing undef
+   %b = load i64, ptr %p  ;; loads 8 bytes containing undef
+   ;; comparing %a and %b compares `undef` values!
+   %fa = freeze i64 %a  ;; freezes undef to stable bit-pattern
+   %fb = freeze i64 %b  ;; freezes undef to stable bit-pattern
+   ;; %fa may compare different to %fb!
+   
+For more information, refer to the  `CUDA C++ discard documentation <https://nvidia.github.io/cccl/libcudacxx/extended_api/memory_access_properties/discard_memory.html>`__ and the `PTX ISA discard documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>`__ .
 
 For more information, refer to the PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>`_.

>From 82a5a756476a7c8ae7ace6138f989aee6b3627c8 Mon Sep 17 00:00:00 2001
From: abmajumder <abmajumder at nvidia.com>
Date: Fri, 28 Feb 2025 22:02:41 +0530
Subject: [PATCH 10/10] refine docs

---
 llvm/docs/NVPTXUsage.rst | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 12ff3d219a0da..6b7f8d953774b 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -685,12 +685,18 @@ Syntax:
 Overview:
 """""""""
 
-The *effects* of the ``@llvm.nvvm.discard.L2*`` intrinsics are those of a non-atomic non-volatile ``llvm.memset`` that writes ``undef`` to the destination address range ``[%ptr, %ptr + immarg)``. 
-Subsequent reads from the address range may read ``undef`` until the memory is overwritten with a different value.
-These operations *hint* the implementation that data in the L2 cache can be destructively discarded without writing it back to memory. 
-The operand ``immarg`` is an integer constant that specifies the length in bytes of the address range ``[%ptr, %ptr + immarg)`` to write ``undef`` into. 
+The *effects* of the ``@llvm.nvvm.discard.L2*`` intrinsics are those of a non-atomic 
+non-volatile ``llvm.memset`` that writes ``undef`` to the destination 
+address range ``[%ptr, %ptr + immarg)``. 
+Subsequent reads from the address range may read ``undef`` until the memory is overwritten 
+with a different value.
+These operations *hint* the implementation that data in the L2 cache can be destructively 
+discarded without writing it back to memory. 
+The operand ``immarg`` is an integer constant that specifies the length in bytes of the 
+address range ``[%ptr, %ptr + immarg)`` to write ``undef`` into. 
 The only supported value for the ``immarg`` operand is ``128``. 
-If generic addressing is used and the specified address does not fall within the address window of global memory (``addrspace(1)``) the behavior is undefined.
+If generic addressing is used and the specified address does not fall within the 
+address window of global memory (``addrspace(1)``) the behavior is undefined.
 
 .. code-block:: llvm
  
@@ -702,10 +708,7 @@ If generic addressing is used and the specified address does not fall within the
    %fb = freeze i64 %b  ;; freezes undef to stable bit-pattern
    ;; %fa may compare different to %fb!
    
-For more information, refer to the  `CUDA C++ discard documentation <https://nvidia.github.io/cccl/libcudacxx/extended_api/memory_access_properties/discard_memory.html>`__ and the `PTX ISA discard documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>`__ .
-
-For more information, refer to the PTX ISA
-`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>`_.
+For more information, refer to the  `CUDA C++ discard documentation <https://nvidia.github.io/cccl/libcudacxx/extended_api/memory_access_properties/discard_memory.html>`__ and to the `PTX ISA discard documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>`__ .
 
 '``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



More information about the llvm-commits mailing list