[llvm] 7c58089 - [NVPTX] Add Intrinsics for discard.* (#128404)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 3 09:23:53 PST 2025
Author: Abhilash Majumder
Date: 2025-03-03T22:53:49+05:30
New Revision: 7c580893ea662b513da71a3da9ae4ab1b2dafc6b
URL: https://github.com/llvm/llvm-project/commit/7c580893ea662b513da71a3da9ae4ab1b2dafc6b
DIFF: https://github.com/llvm/llvm-project/commit/7c580893ea662b513da71a3da9ae4ab1b2dafc6b.diff
LOG: [NVPTX] Add Intrinsics for discard.* (#128404)
[NVPTX] Add Intrinsics for discard.*
This PR adds intrinsics for all variations of discard.*
* These intrinsics support generic or global addressing for all variations (see the usage sketch below).
* The lowering from the NVVM intrinsics is handled directly in the NVPTX TableGen definitions.
* Lit tests are added in discard.ll.
* The generated PTX is verified with a CUDA 12.3 ptxas executable.
* Docs for these intrinsics are added in NVPTXUsage.rst.
For more information, refer to the PTX ISA
<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>_.
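As a usage sketch (illustrative only; the function name below is made up, while the intrinsic
declarations, the 128-byte immediate, and the lowering come from this patch), IR such as

    declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1), i64 immarg)
    declare void @llvm.nvvm.discard.L2(ptr, i64 immarg)

    define void @discard_two_lines(ptr addrspace(1) %gp, ptr %p) {
      ; Hint that the 128-byte range starting at %gp may be dropped from L2
      ; without being written back to memory.
      call void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %gp, i64 128)
      ; Generic-address form; %p must point into global memory.
      call void @llvm.nvvm.discard.L2(ptr %p, i64 128)
      ret void
    }

lowers on sm_80/ptx74 to the PTX instructions discard.global.L2 [%rd], 128; and
discard.L2 [%rd], 128; (see the lit test below).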
---------
Co-authored-by: abmajumder <abmajumder at nvidia.com>
Co-authored-by: gonzalobg <65027571+gonzalobg at users.noreply.github.com>
Added:
llvm/test/CodeGen/NVPTX/discard.ll
Modified:
llvm/docs/NVPTXUsage.rst
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Removed:
################################################################################
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 0403bbc1eabb7..bf5bcb6111829 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -671,6 +671,45 @@ level on which the priority is to be applied. The only supported value for the s
For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-applypriority>`_.
+'``llvm.nvvm.discard.*``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg)
+ declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg)
+
+Overview:
+"""""""""
+
+The *effects* of the ``@llvm.nvvm.discard.L2*`` intrinsics are those of a non-atomic
+non-volatile ``llvm.memset`` that writes ``undef`` to the destination
+address range ``[%ptr, %ptr + immarg)``. The ``%ptr`` must be aligned to 128 bytes.
+Subsequent reads from the address range may read ``undef`` until the memory is overwritten
+with a different value.
+These operations *hint* to the implementation that the data in the L2 cache can be
+destructively discarded without writing it back to memory.
+The operand ``immarg`` is an integer constant that specifies the length in bytes of the
+address range ``[%ptr, %ptr + immarg)`` to write ``undef`` into.
+The only supported value for the ``immarg`` operand is ``128``.
+If generic addressing is used and the specified address does not fall within the
+address window of global memory (``addrspace(1)``), the behavior is undefined.
+
+.. code-block:: llvm
+
+ call void @llvm.nvvm.discard.L2(ptr %p, i64 128) ;; writes `undef` to [p, p+128)
+ %a = load i64, ptr %p ;; loads 8 bytes containing undef
+ %b = load i64, ptr %p ;; loads 8 bytes containing undef
+ ;; comparing %a and %b compares `undef` values!
+ %fa = freeze i64 %a ;; freezes undef to stable bit-pattern
+ %fb = freeze i64 %b ;; freezes undef to stable bit-pattern
+ ;; %fa may compare different to %fb!
+
+For more information, refer to the `CUDA C++ discard documentation <https://nvidia.github.io/cccl/libcudacxx/extended_api/memory_access_properties/discard_memory.html>`__ and to the `PTX ISA discard documentation <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-discard>`__.
+
'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 72b05bca08940..b624f9005bd72 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5082,6 +5082,14 @@ def int_nvvm_applypriority_L2_evict_normal
[IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
ImmArg<ArgIndex<1>>]>;
+// Intrinsics for discard
+def int_nvvm_discard_global_L2 : DefaultAttrsIntrinsic<[],
+ [llvm_global_ptr_ty, llvm_i64_ty], [NoCapture<ArgIndex<0>>,
+ ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
+
+def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
+ [llvm_ptr_ty, llvm_i64_ty], [NoCapture<ArgIndex<0>>,
+ ImmArg<ArgIndex<1>>, IntrHasSideEffects]>;
// Intrinsics for Bulk Copy using TMA (non-tensor)
// From Global to Shared Cluster
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 7d7e69adafcd0..c6be31e20f643 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -800,6 +800,17 @@ class APPLYPRIORITY_L2_INTRS<string addr> :
def APPLYPRIORITY_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"">;
def APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"global">;
+// Discard Intrinsics
+class DISCARD_L2_INTRS<string Addr> :
+ NVPTXInst<(outs), (ins Int64Regs:$addr),
+ StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], 128;",
+ [(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_discard", Addr , "L2"]>.ret)
+ i64:$addr, (i64 128))]>,
+ Requires<[hasPTX<74>, hasSM<80>]>;
+
+def DISCARD_L2 : DISCARD_L2_INTRS<"">;
+def DISCARD_GLOBAL_L2 : DISCARD_L2_INTRS<"global">;
+
//-----------------------------------
// MBarrier Functions
//-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/discard.ll b/llvm/test/CodeGen/NVPTX/discard.ll
new file mode 100644
index 0000000000000..8e5c9bab97c8d
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/discard.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: %if ptxas-11.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 immarg %size)
+declare void @llvm.nvvm.discard.L2(ptr %ptr, i64 immarg %size)
+
+define void @discard_global_L2(ptr addrspace(1) %global_ptr) {
+; CHECK-PTX64-LABEL: discard_global_L2(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [discard_global_L2_param_0];
+; CHECK-PTX64-NEXT: discard.global.L2 [%rd1], 128;
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 128)
+ ret void
+}
+
+define void @discard_L2(ptr %ptr) {
+; CHECK-PTX64-LABEL: discard_L2(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [discard_L2_param_0];
+; CHECK-PTX64-NEXT: discard.L2 [%rd1], 128;
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.discard.L2(ptr %ptr, i64 128)
+ ret void
+}
+
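A further usage note (a hypothetical sketch, not part of this patch): when the generic-address
form is used, the pointer must still refer to global memory, for example obtained via an
addrspacecast from addrspace(1); passing a generic pointer that does not map to global memory
is undefined behavior.

    declare void @llvm.nvvm.discard.L2(ptr, i64 immarg)

    define void @discard_generic_from_global(ptr addrspace(1) %g) {
      ; Cast the global pointer into the generic address space before the call;
      ; the address must still fall within global memory (addrspace(1)).
      %p = addrspacecast ptr addrspace(1) %g to ptr
      call void @llvm.nvvm.discard.L2(ptr %p, i64 128)
      ret void
    }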