[llvm] [NVPTX] Add TMA non-tensor variant of g2s-cta intrinsic (PR #167508)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 11 06:17:21 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-nvptx
Author: Durgadoss R (durga4github)
Changes:
This patch adds a TMA intrinsic for the global to
shared::cta bulk copy, which was introduced in
PTX ISA 8.6. It also removes the NoCapture<>
annotation from the pointer arguments of these
intrinsics, since the copy operations are
asynchronous in nature.
The lit tests are verified with ptxas from CUDA 12.8.
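
For illustration, here is a minimal IR sketch of how the new intrinsic can be used (the function and value names are illustrative, not part of the patch); the final i1 flag must be a compile-time constant and selects the `.L2::cache_hint` form:

```llvm
; Minimal usage sketch (illustrative names); %size must be a multiple of 16.
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i64, i1)

define void @g2s_cta_example(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i64 %ch) {
  ; Plain copy: flag is 0, so the i64 cache-hint operand is ignored.
  ; Expected lowering: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes
  call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i64 0, i1 0)
  ; Copy with an L2 cache hint: flag is 1, so %ch is used.
  ; Expected lowering: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint
  call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
  ret void
}
```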
---
Full diff: https://github.com/llvm/llvm-project/pull/167508.diff
4 Files Affected:
- (modified) llvm/docs/NVPTXUsage.rst (+26)
- (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+15-7)
- (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+19)
- (added) llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll (+46)
``````````diff
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 5ad8f9ab07e40..39f0556aef5a2 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -1334,6 +1334,32 @@ copied and it must be a multiple of 16.
For more information, refer PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`_.
+'``llvm.nvvm.cp.async.bulk.global.to.shared.cta``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 %flag_ch)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.global.to.shared.cta``' intrinsic
+corresponds to the ``cp.async.bulk.shared::cta.global.*`` family
+of PTX instructions. These instructions initiate an asynchronous
+copy of bulk data from global memory to shared::cta memory.
+The 32-bit operand ``%size`` specifies the amount of memory to be
+copied and it must be a multiple of 16. The last argument
+(denoted by ``i1 %flag_ch``) is a compile-time constant. When set,
+it indicates a valid cache_hint (``i64 %ch``) and generates the
+``.L2::cache_hint`` variant of the PTX instruction.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`_.
+
'``llvm.nvvm.cp.async.bulk.shared.cta.to.global``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2710853e17688..21badc2692037 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -2716,8 +2716,19 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
[llvm_i1_ty, // Flag for cta_mask
llvm_i1_ty], // Flag for cache_hint
[IntrConvergent, IntrArgMemOnly,
- WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
- NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>]>;
+ WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>]>;
+
+// From Global to Shared CTA
+def int_nvvm_cp_async_bulk_global_to_shared_cta
+ : DefaultAttrsIntrinsicFlags<[],
+ [llvm_shared_ptr_ty, // dst_shared_cta_ptr
+ llvm_shared_ptr_ty, // mbarrier_ptr
+ llvm_global_ptr_ty, // src_gmem_ptr
+ llvm_i32_ty, // copy_size
+ llvm_i64_ty], // cache_hint
+ [llvm_i1_ty], // Flag for cache_hint
+ [IntrConvergent, IntrArgMemOnly,
+ WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>]>;
// From Shared CTA to Shared Cluster
def int_nvvm_cp_async_bulk_shared_cta_to_cluster
@@ -2727,9 +2738,7 @@ def int_nvvm_cp_async_bulk_shared_cta_to_cluster
llvm_shared_ptr_ty, // src_smem_ptr
llvm_i32_ty], // copy_size
[IntrConvergent, IntrArgMemOnly,
- WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
- NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
- NoCapture<ArgIndex<2>>]>;
+ WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>]>;
// From Shared CTA to Global memory
def int_nvvm_cp_async_bulk_shared_cta_to_global
@@ -2740,8 +2749,7 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
llvm_i64_ty], // cache_hint
[llvm_i1_ty], // Flag for cache_hint
[IntrConvergent, IntrArgMemOnly,
- WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
- NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
+ WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>]>;
// From Shared CTA to Global memory with bytemask
def int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 50827bd548ad5..ea69a54e6db37 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -497,6 +497,10 @@ class CpAsyncBulkStr<bit mc, bit ch, bit mask = 0> {
# !if(mc, ".multicast::cluster", "")
# !if(ch, ".L2::cache_hint", "");
+ // Global to Shared CTA memory
+ string G2S_CTA = "cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes"
+ # !if(ch, ".L2::cache_hint", "");
+
// Shared CTA to Cluster memory
string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes";
}
@@ -543,6 +547,21 @@ multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S_INTR<has_ch = 0>;
defm CP_ASYNC_BULK_G2S_CH : CP_ASYNC_BULK_G2S_INTR<has_ch = 1>;
+multiclass CP_ASYNC_BULK_G2S_CTA_INTR<bit has_ch> {
+ defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cta;
+
+ def "" : NVPTXInst<(outs),
+ (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
+ B32:$size, B64:$ch),
+ !if(has_ch,
+ CpAsyncBulkStr<0, 1>.G2S_CTA # " [$dst], [$src], $size, [$mbar], $ch;",
+ CpAsyncBulkStr<0, 0>.G2S_CTA # " [$dst], [$src], $size, [$mbar];"),
+ [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
+ Requires<[hasPTX<86>, hasSM<90>]>;
+}
+defm CP_ASYNC_BULK_G2S_CTA : CP_ASYNC_BULK_G2S_CTA_INTR<has_ch = 0>;
+defm CP_ASYNC_BULK_G2S_CTA_CH : CP_ASYNC_BULK_G2S_CTA_INTR<has_ch = 1>;
+
def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs),
(ins ADDR:$dst, ADDR:$mbar, ADDR:$src, B32:$size),
CpAsyncBulkStr<0, 0>.C2C # " [$dst], [$src], $size, [$mbar];",
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll
new file mode 100644
index 0000000000000..9872b2aa0826b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-ptx86.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| FileCheck --check-prefixes=CHECK,CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK,CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i64, i1)
+
+define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_g2s(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_3];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_g2s_param_4];
+; CHECK-PTX64-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2];
+; CHECK-PTX64-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4;
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_g2s(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_g2s_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_g2s_param_3];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_4];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 0)
+ tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cta(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
``````````
https://github.com/llvm/llvm-project/pull/167508