[llvm] [NVPTX] Add im2colw/w128 modes support to TMA intrinsics (PR #148863)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 15 08:09:26 PDT 2025


llvmbot wrote:


@llvm/pr-subscribers-backend-nvptx

Author: Durgadoss R (durga4github)

<details>
<summary>Changes</summary>

This patch adds support for the im2col-w/w128 and scatter/gather modes
for TMA Copy and Prefetch intrinsics, completing support for all the
available modes.

* Lit tests are added for all the combinations and verified with a
  CUDA 12.8 ptxas executable.
* Documentation is updated in the NVPTXUsage.rst file.
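
For reference, here is a minimal IR-level sketch of how one of the new im2col.w
intrinsics would be called, based on the declaration added to NVPTXUsage.rst in
the diff below. The wrapper function and the all-zero choices for the multicast,
cache-hint and cta_group operands/flags are illustrative only (they are not taken
from the patch's lit tests):

```llvm
; Hypothetical wrapper; the intrinsic signature follows the documented
; declaration: dst is shared::cluster (addrspace 7), bar is shared::cta
; (addrspace 3), and the trailing i16/i64/i1/i1/i32 operands are the
; multicast mask, cache hint, their flags, and the cta_group flag.
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(
    ptr addrspace(7), ptr addrspace(3), ptr,
    i32, i32, i32, i16, i16, i16, i64, i1, i1, i32)

define void @im2col_w_3d_example(ptr addrspace(7) %dst, ptr addrspace(3) %bar,
                                 ptr %tmap, i32 %d0, i32 %d1, i32 %d2,
                                 i16 %wHalo, i16 %wOffset) {
  ; All optional features disabled: no multicast, no cache hint,
  ; default cta_group (the flag operands must be compile-time constants).
  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(
      ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tmap,
      i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset,
      i16 0, i64 0, i1 0, i1 0, i32 0)
  ret void
}
```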


---

Patch is 226.88 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/148863.diff


12 Files Affected:

- (modified) llvm/docs/NVPTXUsage.rst (+152-9) 
- (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+69-9) 
- (modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+1) 
- (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+158-9) 
- (added) llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll (+193) 
- (added) llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll (+150) 
- (added) llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll (+351) 
- (added) llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll (+174) 
- (added) llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll (+524) 
- (added) llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll (+524) 
- (added) llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll (+171) 
- (added) llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll (+52) 


``````````diff
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 11017fe4e01b4..d28eb6860c33a 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -1072,6 +1072,8 @@ Syntax:
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
 
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
+
 Overview:
 """""""""
 
@@ -1082,7 +1084,13 @@ global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
 in ``tile`` mode. In tile mode, the multi-dimensional layout of the
 source tensor is preserved at the destination. The dimension of the
 tensor data ranges from 1d to 5d with the coordinates specified
-by the ``i32 %d0 ... i32 %d4`` arguments.
+by the ``i32 %d0 ... i32 %d4`` arguments. In ``tile.gather4`` mode,
+four rows in a 2D tensor are combined to form a single 2D destination
+tensor. The first coordinate ``i32 %x0`` denotes the column index
+followed by four coordinates indicating the four row-indices.
+So, this mode takes a total of 5 coordinates as input arguments.
+For more information on ``gather4`` mode, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.
 
 * The last three arguments to these intrinsics are flags
   indicating support for multicast, cache_hint and cta_group::1/2
@@ -1116,10 +1124,18 @@ Syntax:
 
 .. code-block:: llvm
 
-  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
 
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
 Overview:
 """""""""
 
@@ -1131,10 +1147,105 @@ in ``im2col`` mode. In im2col mode, some dimensions of the source tensor
 are unrolled into a single dimensional column at the destination. In this
 mode, the tensor has to be at least three-dimensional. Along with the tensor
 coordinates, im2col offsets are also specified (denoted by
-``i16 im2col0...i16 %im2col2``). The number of im2col offsets is two less
-than the number of dimensions of the tensor operation. The last three arguments
-to these intrinsics are flags, with the same functionality as described
-in the ``tile`` mode intrinsics above.
+``i16 im2col0...i16 %im2col2``). For the ``im2col`` mode, the number of offsets
+is two less than the number of dimensions of the tensor operation. For the
+``im2col.w`` and ``im2col.w.128`` modes, the number of offsets is always 2,
+denoted by ``i16 %wHalo`` and ``i16 %wOffset`` arguments. For more information
+on ``im2col.w`` and ``im2col.w.128`` modes, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes>`_.
+
+The last three arguments to these intrinsics are flags, with the same functionality
+as described in the ``tile`` mode intrinsics above.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(..., i32 %d0, i32 %d1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[1-5]d.shared::cta.global.*``
+set of PTX instructions. These instructions initiate an asynchronous
+copy of tensor data from global memory to shared::cta memory in
+``tile`` mode. In tile mode, the multi-dimensional layout of the
+source tensor is preserved at the destination. The dimension of the
+tensor data ranges from 1d to 5d with the coordinates specified
+by the ``i32 %d0 ... i32 %d4`` arguments. In ``tile.gather4`` mode,
+four rows in a 2D tensor are combined to form a single 2D destination
+tensor. The first coordinate ``i32 %x0`` denotes the column index
+followed by four coordinates indicating the four row-indices.
+So, this mode takes a total of 5 coordinates as input arguments.
+For more information on ``gather4`` mode, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant. When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d``' intrinsics
+correspond to the ``cp.async.bulk.tensor.[1-5]d.shared::cta.global.*``
+set of PTX instructions. These instructions initiate an asynchronous copy
+of tensor data from global memory to shared::cta memory in ``im2col`` mode.
+In im2col mode, some dimensions of the source tensor are unrolled into a
+single dimensional column at the destination. In this mode, the tensor has
+to be at least three-dimensional. Along with the tensor coordinates, im2col
+offsets are also specified (denoted by ``i16 im2col0...i16 %im2col2``).
+For the ``im2col`` mode, the number of offsets is two less than the number
+of dimensions of the tensor operation. For the ``im2col.w`` and ``im2col.w.128``
+modes, the number of offsets is always 2, denoted by ``i16 %wHalo`` and
+``i16 %wOffset`` arguments. For more information on ``im2col.w`` and
+``im2col.w.128`` modes, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes>`_.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant. When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
 
 For more information, refer PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
@@ -1153,6 +1264,8 @@ Syntax:
   declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
 
+  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
+
 Overview:
 """""""""
 
@@ -1162,6 +1275,12 @@ These instructions initiate an asynchronous copy of tensor data from
 shared::cta to global memory (indicated by the ``s2g`` prefix)
 in ``tile`` mode. The dimension of the tensor data ranges from 1d to 5d
 with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments.
+In ``tile.scatter4`` mode, a single 2D source tensor is divided into
+four rows in the 2D destination tensor. The first coordinate ``i32 %x0``
+denotes the column index followed by four coordinates indicating the
+four row-indices. So, this mode takes a total of 5 coordinates as input arguments.
+For more information on ``scatter4`` mode, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.
 
 * The last argument to these intrinsics is a boolean flag
   indicating support for cache_hint. This flag argument must
@@ -1214,6 +1333,8 @@ Syntax:
   declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
 
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch)
+
 Overview:
 """""""""
 
@@ -1225,6 +1346,13 @@ multi-dimensional layout of the source tensor is preserved at the destination.
 The dimension of the tensor data ranges from 1d to 5d with the coordinates
 specified by the ``i32 %d0 ... i32 %d4`` arguments.
 
+In ``tile.gather4`` mode, four rows in the 2-dimensional source tensor are
+fetched to the L2 cache. The first coordinate ``i32 %x0`` denotes the column index
+followed by four coordinates indicating the four row-indices. So, this mode takes
+a total of 5 coordinates as input arguments.
+For more information on ``gather4`` mode, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-scatter4-gather4-modes>`_.
+
 * The last argument to these intrinsics is a boolean flag
   indicating support for cache_hint. This flag argument must
   be a compile-time constant. When set, it indicates a valid
@@ -1246,6 +1374,14 @@ Syntax:
   declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
   declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
 
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
 Overview:
 """""""""
 
@@ -1256,9 +1392,16 @@ of tensor data from global memory to the L2 cache. In im2col mode, some
 dimensions of the source tensor are unrolled into a single dimensional
 column at the destination. In this mode, the tensor has to be at least
 three-dimensional. Along with the tensor coordinates, im2col offsets are
-also specified (denoted by ``i16 im2col0...i16 %im2col2``). The number
-of im2col offsets is two less than the number of dimensions of the tensor
-operation. The last argument to these intrinsics is a boolean flag, with
+also specified (denoted by ``i16 im2col0...i16 %im2col2``). For ``im2col``
+mode, the number of offsets is two less than the number of dimensions of
+the tensor operation. For the ``im2col.w`` and ``im2col.w.128`` modes,
+the number of offsets is always 2, denoted by ``i16 %wHalo`` and
+``i16 %wOffset`` arguments. For more information on ``im2col.w`` and
+``im2col.w.128`` modes, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-w-w128-modes>`_.
+
+
+The last argument to these intrinsics is a boolean flag, with
 the same functionality as described in the ``tile`` mode intrinsics above.
 
 For more information, refer PTX ISA
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 0375f29ad8906..5ddc14445908b 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -2024,9 +2024,7 @@ foreach dim = 1...5 in {
                       tensor_dim_args,      // actual tensor dims
                       [llvm_i64_ty]),       // cache_hint
           [llvm_i1_ty],                     // Flag for cache_hint
-          [IntrConvergent,
-           ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
-           NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
+          [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>]>;
 
     // Intrinsics for TMA Copy with reduction
     foreach red_op = ["add", "min", "max", "inc", "dec", "and", "or", "xor"] in
@@ -2037,18 +2035,31 @@ foreach dim = 1...5 in {
                          tensor_dim_args,     // actual tensor dims
                         [llvm_i64_ty]),       // cache_hint
           [llvm_i1_ty],                       // Flag for cache_hint
-          [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
-           NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
+          [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>]>;
   }
 }
 
+// TMA S2G tile::scatter4
+def int_nvvm_cp_async_bulk_tensor_s2g_tile_scatter4_2d
+  : DefaultAttrsIntrinsicFlags<[],
+      !listconcat([llvm_shared_ptr_ty,        // src_smem_ptr
+                   llvm_ptr_ty],              // tensormap_ptr
+                  !listsplat(llvm_i32_ty, 5), // dims
+                  [llvm_i64_ty]),             // cache_hint
+      [llvm_i1_ty],                           // Flag for cache_hint
+      [IntrConvergent, ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>]>;
+
 // TMA Tensor Copy Intrinsics: G2S -> From Global to Shared memory variants
 foreach dim = 1...5 in {
   defvar tensor_dim_args = !listsplat(llvm_i32_ty, dim);
 
-  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
+  foreach mode = !if(!ge(dim, 3), ["tile", "im2col", "im2col_w", "im2col_w_128"], ["tile"]) in {
     defvar is_im2col = !eq(mode, "im2col");
-    defvar num_im2col_offsets = !if(is_im2col, !add(dim, -2), 0);
+    defvar is_im2colw = !or(!eq(mode, "im2col_w"), !eq(mode, "im2col_w_128"));
+
+    // For im2col_w/w128 modes, the num_offsets is always 2.
+    // For im2col mode, the num_offsets is (dim - 2).
+    defvar num_im2col_offsets = !if(is_im2colw, 2, !if(is_im2col, !add(dim, -2), 0));
     defvar im2col_offsets_args = !listsplat(llvm_i16_ty, num_im2col_offsets);
 
     defvar g2s_params = !listconcat(
@@ -2079,11 +2090,60 @@ foreach dim = 1...5 in {
                        im2col_offsets_args, // im2col offsets
                       [llvm_i64_ty]),       // cache_hint
           [llvm_i1_ty],                     // Flag for cache_hint
-          [IntrConvergent,
-           ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
+          [IntrConvergent, ReadOnly<ArgIndex<0>>]>;
+
+    def int_nvvm_cp_async_bulk_tensor_g2s_cta_ # mode # _ # dim # d :
+      DefaultAttrsIntrinsicFlags<[],
+          !listconcat([llvm_shared_ptr_ty,  // dst_ptr
+                       llvm_shared_ptr_ty,  // mbarrier_ptr
+                       llvm_ptr_ty],        // tensormap_ptr
+                       tensor_dim_args,     // actual tensor dims
+                       im2col_offsets_args, // im2col offsets
+                       [llvm_i64_ty]),      // cache_hint
+          [llvm_i1_ty],                     // Flag for cache_hint
+          [IntrConvergent, WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>]>;
   }
 }
 
+// TMA copy for tile::gather4
+def int_nvvm_cp_async_bulk_tensor_g2s_tile_gather4_2d
+  : DefaultAttrsIntrinsicFlags<[],
+      !listconcat(
+      [llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
+       llvm_shared_ptr_ty,         // mbarrier_ptr
+       llvm_ptr_ty],               // tensormap_ptr
+       !listsplat(llvm_i32_ty, 5), // co-ordinates
+      [llvm_i16_ty,                // cta_mask
+       llvm_i64_ty]),              // cache_hint
+      [llvm_i1_ty,                 // Flag for cta_mask
+       llvm_i1_ty,                 // Flag for cache_hint
+       llvm_i32_ty],               // Flag for cta_group
+      [IntrConvergent,
+       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
+       // Allowed values for cta_group are {0,1,2} i.e [0, 3).
+       Range<ArgIndex<12>, 0, 3>]>;
+
+def int_nvvm_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d
+  : DefaultAttrsIntrinsicFlags<[],
+      !listconcat(
+      [llvm_shared_ptr_ty,         // dst_shared_ptr
+       llvm_shared_ptr_ty,         // mbarrier_ptr
+       llvm_ptr_ty],               // tensormap_ptr
+       !listsplat(llvm_i32_ty, 5), // co-ordinates
+      [llvm_i64_ty]),              // cache_hint
+      [llvm_i1_ty],                // Flag for cache_hint
+      [IntrConvergent,
+       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>]>;
+
+// TMA prefetch for tile::gather4
+def int_nvvm_cp_async_bulk_tensor_prefetch_tile_gather4_2d
+  : DefaultAttrsIntrinsicFlags<[],
+      !listconcat([llvm_ptr...
[truncated]

``````````
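
Similarly, a hypothetical sketch of the new tile.gather4 G2S intrinsic, whose
first coordinate is the column index %x0 followed by four row indices %y0..%y3
(signature as in the docs hunk above; the wrapper function and the constant
flag/operand values are again illustrative only):

```llvm
; Gathers four rows of a 2D tensor into shared::cluster memory, with
; multicast, cache hint and cta_group all disabled.
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(
    ptr addrspace(7), ptr addrspace(3), ptr,
    i32, i32, i32, i32, i32, i16, i64, i1, i1, i32)

define void @tile_gather4_2d_example(ptr addrspace(7) %dst, ptr addrspace(3) %bar,
                                     ptr %tmap, i32 %x0, i32 %y0, i32 %y1,
                                     i32 %y2, i32 %y3) {
  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(
      ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tmap,
      i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3,
      i16 0, i64 0, i1 0, i1 0, i32 0)
  ret void
}
```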

</details>


https://github.com/llvm/llvm-project/pull/148863

