[llvm] [NVPTX] Add TMA Bulk Copy Intrinsics (PR #138679)
Durgadoss R via llvm-commits
llvm-commits at lists.llvm.org
Wed May 7 01:08:21 PDT 2025
https://github.com/durga4github updated https://github.com/llvm/llvm-project/pull/138679
>From 5c97fa7decc4da96e267d10a7d73831c0ab9176d Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadossr at nvidia.com>
Date: Fri, 28 Feb 2025 17:52:16 +0530
Subject: [PATCH] [NVPTX] Add TMA Bulk Copy Intrinsics
This patch adds a new variant of the TMA Bulk Copy
intrinsics, introduced in sm100+. This variant takes
an additional 16-bit byte_mask operand that selects
which bytes of the source data are copied.
Signed-off-by: Durgadoss R <durgadossr at nvidia.com>
---
llvm/docs/NVPTXUsage.rst | 6 ++-
llvm/include/llvm/IR/IntrinsicsNVVM.td | 13 +++++
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 49 +++++++++++++------
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 2 +-
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 35 ++++++++-----
.../CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll | 46 +++++++++++++++++
6 files changed, 121 insertions(+), 30 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index c1426823d87af..2a091e27358eb 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -616,6 +616,7 @@ Syntax:
.. code-block:: llvm
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 %flag_ch)
+ declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(..., i32 %size, i16 %mask, i64 %ch, i1 %flag_ch)
Overview:
"""""""""
@@ -624,7 +625,10 @@ The '``@llvm.nvvm.cp.async.bulk.shared.cta.to.global``' intrinsic
corresponds to the ``cp.async.bulk.global.shared::cta.*`` set of PTX
instructions. These instructions initiate an asynchronous copy from
shared::cta to global memory. The 32-bit operand ``%size`` specifies
-the amount of memory to be copied and it must be a multiple of 16.
+the amount of memory to be copied (in bytes) and it must be a multiple
+of 16. For the ``.bytemask`` variant, the i-th bit of the 16-bit wide
+``%mask`` operand specifies whether the i-th byte of each 16-byte wide
+chunk of source data is copied to the destination.
* The last argument to these intrinsics is a boolean flag
indicating support for cache_hint. This flag argument must
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 8b87822d3fdda..21a0d58f09b62 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -5323,6 +5323,19 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
ImmArg<ArgIndex<4>>]>;
+// From Shared CTA to Global memory with bytemask
+def int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask
+ : DefaultAttrsIntrinsic<[],
+ [llvm_global_ptr_ty, // dst_gmem_ptr
+ llvm_shared_ptr_ty, // src_smem_ptr
+ llvm_i32_ty, // copy_size
+ llvm_i16_ty, // byte_mask
+ llvm_i64_ty, // cache_hint
+ llvm_i1_ty], // Flag for cache_hint
+ [IntrConvergent, IntrArgMemOnly,
+ WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
+ ImmArg<ArgIndex<5>>]>;
+
// Intrinsics for Bulk Copy Prefetch L2
def int_nvvm_cp_async_bulk_prefetch_L2
: DefaultAttrsIntrinsic<[],
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 6f6084b99dda2..60035d47262b5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2720,28 +2720,44 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}
-void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2G(SDNode *N) {
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2GCommon(SDNode *N, bool HasMask) {
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
- // dst, src, size, cache_hint, cache_hint_flag
+ // dst, src, size, [mask], cache_hint, cache_hint_flag
// NumOperands = {Chain, IID} + {Actual intrinsic args}
- // = {2} + {5}
+ // = {2} + {5 or 6} (mask is present only for the bytemask variant)
size_t NumOps = N->getNumOperands();
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
- size_t NumArgs = IsCacheHint ? 4 : 3; // src, dst, size, cache_hint
+ size_t CacheHintIdx = NumOps - 2;
SDLoc DL(N);
- SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
- Ops.push_back(N->getOperand(0)); // Chain operand
+ SDValue DstBase, DstOffset, SrcBase, SrcOffset;
+ SelectADDR(N->getOperand(2), DstBase, DstOffset); // dst
+ SelectADDR(N->getOperand(3), SrcBase, SrcOffset); // src
- bool IsShared32 =
- CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
- unsigned Opcode;
+ // BaseArgs: {dst, src, size}
+ SmallVector<SDValue, 8> Ops{DstBase, DstOffset, SrcBase, SrcOffset,
+ N->getOperand(4)};
+
+ // Push Mask operand, if available
+ if (HasMask)
+ Ops.push_back(N->getOperand(CacheHintIdx - 1));
+
+ // Push CacheHint operand, if available
if (IsCacheHint)
- Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH
- : NVPTX::CP_ASYNC_BULK_S2G_CH;
- else
- Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32
- : NVPTX::CP_ASYNC_BULK_S2G;
+ Ops.push_back(N->getOperand(CacheHintIdx));
+
+ // Finally, the chain operand
+ Ops.push_back(N->getOperand(0));
+
+ unsigned Opcode = [&]() {
+ if (HasMask && IsCacheHint)
+ return NVPTX::CP_ASYNC_BULK_S2G_BM_CH;
+ if (HasMask)
+ return NVPTX::CP_ASYNC_BULK_S2G_BM;
+ if (IsCacheHint)
+ return NVPTX::CP_ASYNC_BULK_S2G_CH;
+ return NVPTX::CP_ASYNC_BULK_S2G;
+ }();
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}
@@ -2928,7 +2944,10 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
SelectCpAsyncBulkG2S(N);
return true;
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
- SelectCpAsyncBulkS2G(N);
+ SelectCpAsyncBulkS2GCommon(N);
+ return true;
+ case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global_bytemask:
+ SelectCpAsyncBulkS2GCommon(N, /*HasMask=*/true);
return true;
case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
SelectCpAsyncBulkPrefetchL2(N);
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 23cbd458571a0..53f6c060405e7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -93,7 +93,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
void SelectCpAsyncBulkG2S(SDNode *N);
- void SelectCpAsyncBulkS2G(SDNode *N);
+ void SelectCpAsyncBulkS2GCommon(SDNode *N, bool HasMask = false);
void SelectCpAsyncBulkPrefetchL2(SDNode *N);
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 3eedb43e4c81a..6f6559e5c269d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -511,10 +511,11 @@ def CP_ASYNC_BULK_WAIT_GROUP_READ :
// TMA Async Bulk Copy Functions
//------------------------------
-class CpAsyncBulkStr<bit mc, bit ch> {
+class CpAsyncBulkStr<bit mc, bit ch, bit mask = 0> {
// Shared to Global memory
string S2G = "cp.async.bulk.global.shared::cta.bulk_group"
- # !if(ch, ".L2::cache_hint", "");
+ # !if(ch, ".L2::cache_hint", "")
+ # !if(mask, ".cp_mask", "");
// Global to Shared cluster memory
string G2S = "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes"
@@ -525,18 +526,26 @@ class CpAsyncBulkStr<bit mc, bit ch> {
string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes";
}
-multiclass CP_ASYNC_BULK_S2G<NVPTXRegClass rc> {
- def NAME: NVPTXInst<(outs),
- (ins Int64Regs:$dst, rc:$src, Int32Regs:$size),
- !strconcat(CpAsyncBulkStr<0, 0>.S2G, " [$dst], [$src], $size;"), []>,
+def CP_ASYNC_BULK_S2G : NVPTXInst<(outs),
+ (ins ADDR:$dst, ADDR:$src, Int32Regs:$size),
+ CpAsyncBulkStr<0, 0>.S2G # " [$dst], [$src], $size;", []>,
Requires<[hasPTX<80>, hasSM<90>]>;
- def NAME # _CH: NVPTXInst<(outs),
- (ins Int64Regs:$dst, rc:$src, Int32Regs:$size, Int64Regs:$ch),
- !strconcat(CpAsyncBulkStr<0, 1>.S2G, " [$dst], [$src], $size, $ch;"), []>,
- Requires<[hasPTX<80>, hasSM<90>]>;
-}
-defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G<Int64Regs>;
-defm CP_ASYNC_BULK_S2G_SHARED32 : CP_ASYNC_BULK_S2G<Int32Regs>;
+
+def CP_ASYNC_BULK_S2G_CH : NVPTXInst<(outs),
+ (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
+ CpAsyncBulkStr<0, 1>.S2G # " [$dst], [$src], $size, $ch;", []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+
+// Variants with bytemask
+def CP_ASYNC_BULK_S2G_BM : NVPTXInst<(outs),
+ (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int16Regs:$mask),
+ CpAsyncBulkStr<0, 0, 1>.S2G # " [$dst], [$src], $size, $mask;", []>,
+ Requires<[hasPTX<86>, hasSM<100>]>;
+
+def CP_ASYNC_BULK_S2G_BM_CH : NVPTXInst<(outs),
+ (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
+ CpAsyncBulkStr<0, 1, 1>.S2G # " [$dst], [$src], $size, $ch, $mask;", []>,
+ Requires<[hasPTX<86>, hasSM<100>]>;
multiclass CP_ASYNC_BULK_G2S<NVPTXRegClass rc> {
def NAME: NVPTXInst<(outs),
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll
new file mode 100644
index 0000000000000..c2e2b26ed6882
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %}
+; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1), ptr addrspace(3), i32, i16, i64, i1)
+
+define void @cp_async_bulk_s2g_bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i16 %mask, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_s2g_bytemask(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_bytemask_param_0];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_bytemask_param_1];
+; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_bytemask_param_2];
+; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_s2g_bytemask_param_3];
+; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_s2g_bytemask_param_4];
+; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%rd2], %r1, %rd3, %rs1;
+; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%rd2], %r1, %rs1;
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_s2g_bytemask(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_bytemask_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_bytemask_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_s2g_bytemask_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_s2g_bytemask_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_bytemask_param_4];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%r1], %r2, %rd2, %rs1;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%r1], %r2, %rs1;
+; CHECK-PTX-SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i16 %mask, i64 %ch, i1 1)
+ tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i16 %mask, i64 0, i1 0)
+ ret void
+}
More information about the llvm-commits
mailing list