[llvm] fa7f0e5 - [NVPTX] Add Bulk Copy Prefetch Intrinsics (#123226)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 23 03:19:47 PST 2025
Author: Abhilash Majumder
Date: 2025-01-23T16:49:44+05:30
New Revision: fa7f0e582bc25a91d89dab7c488a1619060f9bef
URL: https://github.com/llvm/llvm-project/commit/fa7f0e582bc25a91d89dab7c488a1619060f9bef
DIFF: https://github.com/llvm/llvm-project/commit/fa7f0e582bc25a91d89dab7c488a1619060f9bef.diff
LOG: [NVPTX] Add Bulk Copy Prefetch Intrinsics (#123226)
This patch adds NVVM intrinsics and NVPTX codegen for:
- cp.async.bulk.prefetch.L2.* variants
- These intrinsics optionally support cache_hints as indicated by the
boolean flag argument.
- Lit tests are added for all combinations of these intrinsics in
cp-async-bulk.ll.
- The generated PTX is verified with a 12.3 ptxas executable.
- Added docs for these intrinsics in NVPTXUsage.rst file.
PTX Spec reference:
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch
Co-authored-by: abmajumder <abmajumder at nvidia.com>
Added:
Modified:
llvm/docs/NVPTXUsage.rst
llvm/include/llvm/IR/IntrinsicsNVVM.td
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
Removed:
################################################################################
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 25a230f65fd3dd..a5a78a2882eec3 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -553,6 +553,34 @@ it must be a multiple of 16.
For more information, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`_.
+'``llvm.nvvm.cp.async.bulk.prefetch.L2``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 %flag_ch)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.prefetch.L2``' intrinsic
+corresponds to the ``cp.async.bulk.prefetch.L2.*`` family
+of PTX instructions. These instructions initiate an asynchronous
+prefetch of bulk data from global memory to the L2 cache.
+The 32-bit operand ``%size`` specifies the amount of memory to be
+prefetched in terms of bytes and it must be a multiple of 16.
+
+* The last argument to these intrinsics is boolean flag indicating
+ support for cache_hint. These flag argument must be compile-time
+ constant. When set, it indicates a valid cache_hint (``i64 %ch``)
+ and generates the ``.L2::cache_hint`` variant of the PTX instruction.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch>`_.
+
'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 00a76018d8415d..00c441920bfa1c 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5033,4 +5033,15 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<4>>]>;
+// Intrinsics for Bulk Copy Prefetch L2
+def int_nvvm_cp_async_bulk_prefetch_L2
+ : DefaultAttrsIntrinsic<[],
+ [llvm_global_ptr_ty, // src_gmem_ptr
+ llvm_i32_ty, // copy_size
+ llvm_i64_ty, // cache_hint
+ llvm_i1_ty], // Flag for cache_hint
+ [IntrConvergent, IntrArgMemOnly,
+ NoCapture<ArgIndex<0>>, ReadOnly<ArgIndex<0>>,
+ ImmArg<ArgIndex<3>>]>;
+
} // let TargetPrefix = "nvvm"
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 8f6adf2c22f922..ac8ce05724750c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -3168,6 +3168,25 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkPrefetchL2(SDNode *N) {
+ // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
+ // src, size, cache_hint, cache_hint_flag
+ // NumOperands = {Chain, IID} + {Actual intrinsic args}
+ // = {2} + {4}
+ size_t NumOps = N->getNumOperands();
+ bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
+ size_t NumArgs = IsCacheHint ? 3 : 2; // src, size, cache_hint
+
+ SDLoc DL(N);
+ SmallVector<SDValue, 4> Ops(N->ops().slice(2, NumArgs));
+ Ops.push_back(N->getOperand(0)); // Chain operand
+
+ unsigned Opcode = IsCacheHint
+ ? NVPTX::CP_ASYNC_BULK_PREFETCH_CH
+ : NVPTX::CP_ASYNC_BULK_PREFETCH;
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
unsigned IID = N->getConstantOperandVal(1);
using TMARedTy = llvm::nvvm::TMAReductionOp;
@@ -3181,6 +3200,9 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
SelectCpAsyncBulkS2G(N);
return true;
+ case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
+ SelectCpAsyncBulkPrefetchL2(N);
+ return true;
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 7661f153238fcd..8dc6bc86c68281 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -93,6 +93,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
void SelectI128toV2I64(SDNode *N);
void SelectCpAsyncBulkG2S(SDNode *N);
void SelectCpAsyncBulkS2G(SDNode *N);
+ void SelectCpAsyncBulkPrefetchL2(SDNode *N);
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 48d75728aef8e2..6198c4aa9b94cb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -547,6 +547,18 @@ multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER<NVPTXRegClass rc> {
defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int64Regs>;
defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int32Regs>;
+//------------------------------
+// Bulk Copy Prefetch Functions
+//------------------------------
+def CP_ASYNC_BULK_PREFETCH : NVPTXInst<(outs),
+ (ins Int64Regs:$src, Int32Regs:$size),
+ "cp.async.bulk.prefetch.L2.global [$src], $size;", []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+
+def CP_ASYNC_BULK_PREFETCH_CH : NVPTXInst<(outs),
+ (ins Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
+ "cp.async.bulk.prefetch.L2.global.L2::cache_hint [$src], $size, $ch;", []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
//-------------------------------------
// TMA Async Bulk Tensor Copy Functions
//-------------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
index aefd18a0632a08..cbb53df4a49b09 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
@@ -9,6 +9,7 @@ target triple = "nvptx64-nvidia-cuda"
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i16, i64, i1, i1)
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1), ptr addrspace(3), i32, i64, i1)
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(3), i32)
+declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1), i32, i64, i1)
define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i16 %mc, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_g2s(
@@ -116,3 +117,21 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size)
ret void
}
+
+define void @cp_async_bulk_prefetch(ptr addrspace(1) %src, i32 %size, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_prefetch(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_prefetch_param_0];
+; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_prefetch_param_1];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_prefetch_param_2];
+; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2;
+; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global [%rd1], %r1;
+; CHECK-PTX64-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
+ tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 0, i1 0)
+ ret void
+}
More information about the llvm-commits
mailing list