[llvm] 6a961dc - [NVPTX] Add intrinsics for prefetch.* (#125887)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 11 00:54:50 PST 2025
Author: Abhilash Majumder
Date: 2025-02-11T14:24:46+05:30
New Revision: 6a961dc03d802fb1f34b2b0cd8d09f427382dbdb
URL: https://github.com/llvm/llvm-project/commit/6a961dc03d802fb1f34b2b0cd8d09f427382dbdb
DIFF: https://github.com/llvm/llvm-project/commit/6a961dc03d802fb1f34b2b0cd8d09f427382dbdb.diff
LOG: [NVPTX] Add intrinsics for prefetch.* (#125887)
\[NVPTX\] Add Prefetch intrinsics
This PR adds prefetch intrinsics with the relevant eviction priorities.
* Lit tests are added as part of prefetch.ll
* The generated PTX is verified with a 12.3 ptxas executable.
* Added docs for these intrinsics in NVPTXUsage.rst.
For more information, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu>`_.
---------
Co-authored-by: abmajumder <abmajumder at nvidia.com>
Added:
llvm/test/CodeGen/NVPTX/prefetch.ll
Modified:
llvm/docs/NVPTXUsage.rst
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Removed:
################################################################################
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index dcd0a3ac3639b8..1680b114335370 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -581,6 +581,46 @@ prefetched in terms of bytes and it must be a multiple of 16.
For more information, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch>`_.
+'``llvm.nvvm.prefetch.*``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.prefetch.local.L1.evictnormal(ptr addrspace(5) %local_ptr)
+ declare void @llvm.nvvm.prefetch.local.L2.evictnormal(ptr addrspace(5) %local_ptr)
+
+ declare void @llvm.nvvm.prefetch.global.L1.evictnormal(ptr addrspace(1) %global_ptr)
+ declare void @llvm.nvvm.prefetch.global.L2.evictnormal(ptr addrspace(1) %global_ptr)
+ declare void @llvm.nvvm.prefetch.global.L1.evictlast(ptr addrspace(1) %global_ptr)
+ declare void @llvm.nvvm.prefetch.global.L2.evictlast(ptr addrspace(1) %global_ptr)
+
+ declare void @llvm.nvvm.prefetch.L1.evictnormal(ptr %ptr)
+ declare void @llvm.nvvm.prefetch.L2.evictnormal(ptr %ptr)
+
+ declare void @llvm.nvvm.prefetchu.L1.evictnormal(ptr %ptr)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.prefetch.*``' and '``@llvm.nvvm.prefetchu.*``' intrinsic
+correspond to the '``prefetch.*``;' and '``prefetchu.*``' family of PTX instructions.
+The '``prefetch.*``' instructions bring the cache line containing the
+specified address in global or local memory address space into the
+specified cache level (L1 or L2). The '`prefetchu.*``' instruction brings the cache line
+containing the specified generic address into the specified uniform cache level.
+If no address space is specified, it is assumed to be generic address. The intrinsic
+uses and eviction priority which can be accessed by the '``.level::eviction_priority``' modifier.
+
+* A prefetch to a shared memory location performs no operation.
+* A prefetch into the uniform cache requires a generic address,
+ and no operation occurs if the address maps to a const, local, or shared memory location.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu>`_.
+
'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index f299a145ac73b1..6af1f2a1667734 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -48,6 +48,7 @@
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
+def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
//
@@ -4999,6 +5000,26 @@ foreach dim = [1, 2, 3, 4, 5] in {
}
}
+// Intrinsics for Prefetch and Prefetchu
+foreach level = ["L1", "L2"] in {
+ foreach addr = ["global", "local", ""] in {
+ foreach evict = !if(!eq(addr, "global"), ["evictlast", "evictnormal"], ["evictnormal"]) in {
+ defvar suffix = "" # !if(!eq(addr, ""), "", addr # "_") # level # "_" # evict;
+ def int_nvvm_prefetch_ # suffix : Intrinsic<[],
+ !cond(
+ !eq(addr, "global") : [llvm_global_ptr_ty],
+ !eq(addr, "local") : [llvm_local_ptr_ty],
+ !eq(addr, "") : [llvm_ptr_ty]),
+ [IntrArgMemOnly, ReadOnly<ArgIndex<0>>,
+ NoCapture<ArgIndex<0>>]>;
+ }
+ }
+}
+
+def int_nvvm_prefetchu_L1_evictnormal : Intrinsic<[], [llvm_ptr_ty],
+ [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
+
+
// Intrinsics for Bulk Copy using TMA (non-tensor)
// From Global to Shared Cluster
def int_nvvm_cp_async_bulk_global_to_shared_cluster
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 5331f36ad09997..39dac65d67eb96 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -740,6 +740,33 @@ foreach dim = [1, 2, 3, 4, 5] in {
}
}
+//Prefetch and Prefetchu
+class Join<string sep, list<string> lst> {
+ string ret = !foldl("", lst, a, b, !if(!eq(a, ""), b, !if(!eq(b,""), a, !strconcat(a, sep, b))));
+}
+
+class PREFETCH_INTRS<string InstName> :
+ NVPTXInst<(outs), (ins Int64Regs:$addr),
+ InstName # " [$addr];",
+ [(!cast<Intrinsic>(!strconcat("int_nvvm_",
+ !subst(".", "_", InstName))) i64:$addr)]>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+
+
+// Only global supports evictlast and evictnormal.
+// Other variants (local and default) only support evictnormal
+foreach level = ["L1", "L2"] in {
+ foreach addr = ["global", "local", ""] in {
+ foreach evict = !if(!eq(addr, "global"), ["evictlast", "evictnormal"], ["evictnormal"]) in {
+ defvar suffix = Join<"_", [addr, level, evict]>.ret;
+ defvar inst_name = "prefetch." # !subst("_", ".", suffix);
+ def PREFETCH_# suffix : PREFETCH_INTRS<inst_name>;
+ }
+ }
+}
+
+def PREFETCHU_L1_EVICTNORMAL : PREFETCH_INTRS<"prefetchu.L1.evictnormal">;
+
//-----------------------------------
// MBarrier Functions
//-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll
new file mode 100644
index 00000000000000..cf47000ffd9aaa
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/prefetch.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.prefetch.local.L1.evictnormal(ptr addrspace(5) %local_ptr)
+declare void @llvm.nvvm.prefetch.local.L2.evictnormal(ptr addrspace(5) %local_ptr)
+
+declare void @llvm.nvvm.prefetch.global.L1.evictnormal(ptr addrspace(1) %global_ptr)
+declare void @llvm.nvvm.prefetch.global.L2.evictnormal(ptr addrspace(1) %global_ptr)
+declare void @llvm.nvvm.prefetch.global.L1.evictlast(ptr addrspace(1) %global_ptr)
+declare void @llvm.nvvm.prefetch.global.L2.evictlast(ptr addrspace(1) %global_ptr)
+
+declare void @llvm.nvvm.prefetch.L1.evictnormal(ptr %ptr)
+declare void @llvm.nvvm.prefetch.L2.evictnormal(ptr %ptr)
+
+declare void @llvm.nvvm.prefetchu.L1.evictnormal(ptr %ptr)
+
+define void @prefetch_local(ptr addrspace(5) %local_ptr) {
+; CHECK-PTX64-LABEL: prefetch_local(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch_local_param_0];
+; CHECK-PTX64-NEXT: prefetch.local.L1.evictnormal [%rd1];
+; CHECK-PTX64-NEXT: prefetch.local.L2.evictnormal [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.local.L1.evictnormal(ptr addrspace(5) %local_ptr)
+ tail call void @llvm.nvvm.prefetch.local.L2.evictnormal(ptr addrspace(5) %local_ptr)
+ ret void
+}
+
+define void @prefetch_global(ptr addrspace(1) %global_ptr) {
+; CHECK-PTX64-LABEL: prefetch_global(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch_global_param_0];
+; CHECK-PTX64-NEXT: prefetch.global.L1.evictnormal [%rd1];
+; CHECK-PTX64-NEXT: prefetch.global.L2.evictnormal [%rd1];
+; CHECK-PTX64-NEXT: prefetch.global.L1.evictlast [%rd1];
+; CHECK-PTX64-NEXT: prefetch.global.L2.evictlast [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.global.L1.evictnormal(ptr addrspace(1) %global_ptr)
+ tail call void @llvm.nvvm.prefetch.global.L2.evictnormal(ptr addrspace(1) %global_ptr)
+ tail call void @llvm.nvvm.prefetch.global.L1.evictlast(ptr addrspace(1) %global_ptr)
+ tail call void @llvm.nvvm.prefetch.global.L2.evictlast(ptr addrspace(1) %global_ptr)
+ ret void
+}
+
+
+define void @prefetch_(ptr %ptr) {
+; CHECK-PTX64-LABEL: prefetch_(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch__param_0];
+; CHECK-PTX64-NEXT: prefetch.L1.evictnormal [%rd1];
+; CHECK-PTX64-NEXT: prefetch.L2.evictnormal [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetch.L1.evictnormal(ptr %ptr)
+ tail call void @llvm.nvvm.prefetch.L2.evictnormal(ptr %ptr)
+ ret void
+}
+
+define void @prefetchu_l1(ptr %ptr) {
+; CHECK-PTX64-LABEL: prefetchu_l1(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetchu_l1_param_0];
+; CHECK-PTX64-NEXT: prefetchu.L1.evictnormal [%rd1];
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.prefetchu.L1.evictnormal(ptr %ptr)
+ ret void
+}
\ No newline at end of file
More information about the llvm-commits
mailing list