[llvm] [NVPTX] Add TMA bulk tensor copy intrinsics (PR #96083)
Durgadoss R via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 19 08:08:47 PDT 2024
https://github.com/durga4github updated https://github.com/llvm/llvm-project/pull/96083
>From d1526818d36637679b42c37f03b0deba073a85f4 Mon Sep 17 00:00:00 2001
From: Durgadoss R <durgadossr at nvidia.com>
Date: Wed, 17 Jul 2024 05:33:50 -0700
Subject: [PATCH] [NVPTX] Add TMA bulk tensor copy intrinsics
This patch adds NVVM intrinsics and NVPTX CodeGen support for:
* cp.async.bulk.tensor.S2G: 1D -> 5D variants, with optional
support for cache_hints.
* cp.async.bulk.tensor.G2S: 1D -> 5D variants, with optional
support for multicast and cache_hints. In addition, the 3D -> 5D
variants also support an 'im2col' mode, with its own set of offsets.
* The first argument of these intrinsics is an immediate i32 flag.
Its bit-fields enable the optional features (multicast, cache_hints
and im2col offsets) where applicable. The backend inspects these
flag bits and lowers to the matching PTX instruction (see the
example encoding after this list).
* Lit tests are added for all combinations of these intrinsics
in cp-async-bulk-tensor-g2s/s2g.ll.
* The generated PTX is verified with a 12.3 ptxas executable.
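For illustration, here is a minimal IR sketch of the flag encoding, based on
the bit-field layout introduced in NVVMIntrinsicFlags.h (bit 0 = cache_hint,
bit 1 = multicast, bits 2-4 = load mode, with 0 = tile and 1 = im2col) and on
the lit tests in this patch. A flag value of 7 therefore requests the im2col
form with both multicast and cache_hint enabled. The wrapper function and
value names below are made up for the example; the intrinsic name and
signature are the ones added here:

  declare void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d(
      i32, ptr addrspace(3), ptr addrspace(3), ptr,
      i32, i32, i32, i16, i16, i64)

  define void @g2s_3d_im2col_mc_ch(ptr addrspace(3) %dst, ptr addrspace(3) %bar,
                                   ptr %tmap, i32 %d0, i32 %d1, i32 %d2,
                                   i16 %off0, i16 %mc, i64 %ch) {
    ; flag = 7: cache_hint (bit 0) | multicast (bit 1) | im2col load mode (bits 2-4 = 1)
    call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d(
        i32 7, ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap,
        i32 %d0, i32 %d1, i32 %d2, i16 %off0, i16 %mc, i64 %ch)
    ret void
  }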
TODO: Update documentation for these intrinsics in NVPTX guide.
Signed-off-by: Durgadoss R <durgadossr at nvidia.com>
---
llvm/include/llvm/IR/IntrinsicsNVVM.td | 54 +++
llvm/include/llvm/IR/NVVMIntrinsicFlags.h | 40 ++
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 238 ++++++++++
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 3 +
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 121 +++++
.../CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll | 431 ++++++++++++++++++
.../CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll | 232 ++++++++++
7 files changed, 1119 insertions(+)
create mode 100644 llvm/include/llvm/IR/NVVMIntrinsicFlags.h
create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 854eb2f8dd6df..527d17b6608d7 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -552,6 +552,22 @@ class SHFL_INFO<bit sync, string mode, string type, bit return_pred> {
[OpType, llvm_i32_ty, llvm_i32_ty]);
}
+class NVVM_INTRINSIC_RECORD<string intr> {
+ string record = !subst(".", "_", !subst("llvm.", "int_", intr));
+}
+
+class NVVM_CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_NAME<int dim> {
+ string intr = "llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem"
+ # "." # dim # "d";
+ string record = NVVM_INTRINSIC_RECORD<intr>.record;
+}
+
+class NVVM_CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_NAME<int dim> {
+ string intr = "llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem"
+ # "." # dim # "d";
+ string record = NVVM_INTRINSIC_RECORD<intr>.record;
+}
+
let TargetPrefix = "nvvm" in {
def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4828,4 +4844,42 @@ def int_nvvm_setmaxnreg_dec_sync_aligned_u32
def int_nvvm_exit : ClangBuiltin<"__nvvm_exit">,
Intrinsic<[], [], [IntrConvergent, IntrInaccessibleMemOnly, IntrNoReturn]>;
+// -------- llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem
+class NVVM_CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_INTR<int dim> :
+ DefaultAttrsIntrinsic<[],
+ !listconcat(
+ // flags, dst_smem_ptr, barrier_ptr, tensor_map_ptr
+ [llvm_i32_ty, llvm_shared_ptr_ty, llvm_shared_ptr_ty, llvm_ptr_ty],
+ !listsplat(llvm_i32_ty, dim), // tensor_dims
+ !if(!ge(dim, 3), !listsplat(llvm_i16_ty, !add(dim, -2)), []), // im2col
+ [llvm_i16_ty, llvm_i64_ty]), // cta_mask, cache_policy
+ [IntrConvergent, IntrArgMemOnly, ImmArg<ArgIndex<0>>,
+ WriteOnly<ArgIndex<1>>, ReadOnly<ArgIndex<3>>,
+ NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>,
+ NoCapture<ArgIndex<3>>],
+ NVVM_CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_NAME<dim>.intr>;
+
+foreach dim = [1, 2, 3, 4, 5] in {
+ def NVVM_CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_NAME<dim>.record :
+ NVVM_CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_INTR<dim>;
+}
+
+// -------- llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem
+class NVVM_CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_INTR<int dim> :
+ DefaultAttrsIntrinsic<[],
+ !listconcat(
+ // flags, src_smem_ptr, tensor_map_ptr
+ [llvm_i32_ty, llvm_shared_ptr_ty, llvm_ptr_ty],
+ !listsplat(llvm_i32_ty, dim), // tensor_dims
+ [llvm_i64_ty]), // cache_policy
+ [IntrConvergent, IntrArgMemOnly, ImmArg<ArgIndex<0>>,
+ ReadOnly<ArgIndex<1>>, WriteOnly<ArgIndex<2>>,
+ NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>],
+ NVVM_CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_NAME<dim>.intr>;
+
+foreach dim = [1, 2, 3, 4, 5] in {
+ def NVVM_CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_NAME<dim>.record :
+ NVVM_CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_INTR<dim>;
+}
+
} // let TargetPrefix = "nvvm"
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicFlags.h b/llvm/include/llvm/IR/NVVMIntrinsicFlags.h
new file mode 100644
index 0000000000000..a8273b8de5adf
--- /dev/null
+++ b/llvm/include/llvm/IR/NVVMIntrinsicFlags.h
@@ -0,0 +1,40 @@
+//===--- NVVMIntrinsicFlags.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains the definitions of the enumerations and flags
+/// associated with NVVM Intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_NVVMINTRINSICFLAGS_H
+#define LLVM_SUPPORT_NVVMINTRINSICFLAGS_H
+
+#include <stdint.h>
+
+namespace llvm {
+namespace nvvm {
+
+enum class CpAsyncBulkTensorLoadMode {
+ TILE = 0,
+ IM2COL = 1,
+};
+
+typedef union {
+ int V;
+ struct {
+ unsigned CacheHint : 1;
+ unsigned MultiCast : 1;
+ unsigned LoadMode : 3; // CpAsyncBulkTensorLoadMode
+ unsigned reserved : 27;
+ } U;
+} CpAsyncBulkTensorFlags;
+
+} // namespace nvvm
+} // namespace llvm
+#endif // LLVM_SUPPORT_NVVMINTRINSICFLAGS_H
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 371ec8596ef63..e6e2eb731ad5e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/NVVMIntrinsicFlags.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -160,6 +161,10 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
if (tryIntrinsicChain(N))
return;
break;
+ case ISD::INTRINSIC_VOID:
+ if (tryIntrinsicVoid(N))
+ return;
+ break;
case NVPTXISD::Tex1DFloatS32:
case NVPTXISD::Tex1DFloatFloat:
case NVPTXISD::Tex1DFloatFloatLevel:
@@ -4091,3 +4096,236 @@ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
}
}
}
+
+static size_t GetCpAsyncBulkTensorDimFromIntrinsic(unsigned IID) {
+ switch (IID) {
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_1d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_1d:
+ return 1;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_2d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_2d:
+ return 2;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_3d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_3d:
+ return 3;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_4d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_4d:
+ return 4;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_5d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_5d:
+ return 5;
+ default:
+ llvm_unreachable(
+ "Invalid Tensor dim in nvvm_cp_async_bulk_tensor intrinsic");
+ }
+}
+
+#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, suffix) \
+ (IsShared32 \
+ ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
+ : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
+
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode) \
+ (IsCacheHint ? (CP_ASYNC_BULK_TENSOR_OPCODE(SMEM_TO_GMEM, dim, mode, _CH)) \
+ : (CP_ASYNC_BULK_TENSOR_OPCODE(SMEM_TO_GMEM, dim, mode, )))
+
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode) \
+ [&]() -> auto { \
+ if (IsMultiCast && IsCacheHint) \
+ return CP_ASYNC_BULK_TENSOR_OPCODE(GMEM_TO_SMEM, dim, mode, _MC_CH); \
+ if (IsCacheHint) \
+ return CP_ASYNC_BULK_TENSOR_OPCODE(GMEM_TO_SMEM, dim, mode, _CH); \
+ if (IsMultiCast) \
+ return CP_ASYNC_BULK_TENSOR_OPCODE(GMEM_TO_SMEM, dim, mode, _MC); \
+ return CP_ASYNC_BULK_TENSOR_OPCODE(GMEM_TO_SMEM, dim, mode, ); \
+ }()
+
+static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
+ bool IsCacheHint, bool IsIm2Col) {
+ if (IsIm2Col) {
+ switch (Dim) {
+ case 3:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL);
+ case 4:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL);
+ case 5:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL);
+ default:
+ llvm_unreachable("Invalid Dimension in im2col mode for "
+ "GetCpAsyncBulkTensorS2GOpcode.");
+ }
+ } else {
+ switch (Dim) {
+ case 1:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE);
+ case 2:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE);
+ case 3:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE);
+ case 4:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE);
+ case 5:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE);
+ default:
+ llvm_unreachable(
+ "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
+ }
+ }
+}
+
+static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
+ bool IsMultiCast,
+ bool IsCacheHint, bool IsIm2Col) {
+ if (IsIm2Col) {
+ switch (Dim) {
+ case 3:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL);
+ case 4:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL);
+ case 5:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL);
+ default:
+ llvm_unreachable("Invalid Dimension in im2col mode for "
+ "GetCpAsyncBulkTensorG2SOpcode.");
+ }
+ } else {
+ switch (Dim) {
+ case 1:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE);
+ case 2:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE);
+ case 3:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE);
+ case 4:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE);
+ case 5:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE);
+ default:
+ llvm_unreachable(
+ "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
+ }
+ }
+}
+
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2G(SDNode *N) {
+ unsigned int SharedPointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED);
+ bool IsShared32 = (SharedPointerSize == 32);
+
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ size_t NumDims = GetCpAsyncBulkTensorDimFromIntrinsic(IID);
+
+ ConstantSDNode *FlagsNode = cast<ConstantSDNode>(N->getOperand(2));
+ nvvm::CpAsyncBulkTensorFlags Flags;
+ Flags.V = static_cast<unsigned>(FlagsNode->getZExtValue());
+ bool IsCacheHint = Flags.U.CacheHint == 1;
+ bool IsIm2Col = Flags.U.LoadMode == 1;
+
+ SDLoc DL(N);
+ // List of operands that are common to both variants
+ SmallVector<SDValue, 4> Ops{
+ N->getOperand(3), // Src pointer in smem
+ N->getOperand(4), // Dst tensor_map pointer in gmem
+ };
+
+ // Tensor Dims from [1-5] followed by the cache-hint operand
+ size_t TensorDimsStartIndex = 5;
+ size_t CacheHintIndex = TensorDimsStartIndex + NumDims;
+ for (size_t i = 0; i < NumDims; i++)
+ Ops.push_back(N->getOperand(TensorDimsStartIndex + i));
+
+ // Push the cache-hint operand, if available
+ if (IsCacheHint)
+ Ops.push_back(N->getOperand(CacheHintIndex));
+
+ // Finally, the chain operand
+ Ops.push_back(N->getOperand(0));
+
+ unsigned Opcode =
+ GetCpAsyncBulkTensorS2GOpcode(NumDims, IsShared32, IsCacheHint, IsIm2Col);
+
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2S(SDNode *N) {
+ unsigned int SharedPointerSize =
+ CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED);
+ bool IsShared32 = (SharedPointerSize == 32);
+
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ size_t NumDims = GetCpAsyncBulkTensorDimFromIntrinsic(IID);
+
+ ConstantSDNode *FlagsNode = cast<ConstantSDNode>(N->getOperand(2));
+ nvvm::CpAsyncBulkTensorFlags Flags;
+ Flags.V = static_cast<unsigned>(FlagsNode->getZExtValue());
+ bool IsCacheHint = Flags.U.CacheHint == 1;
+ bool IsMultiCast = Flags.U.MultiCast == 1;
+ bool IsIm2Col = Flags.U.LoadMode == 1;
+
+ if (IsIm2Col && NumDims < 3)
+ report_fatal_error("NumDims should be at least 3 for Im2Col mode");
+
+ SDLoc DL(N);
+ // List of operands that are common to both tile and im2col variants
+ SmallVector<SDValue, 4> Ops{
+ N->getOperand(3), // Dst pointer in smem
+ N->getOperand(4), // Mbarrier pointer in smem
+ N->getOperand(5), // Src pointer (i.e. tensor_map) in gmem
+ };
+
+ // Tensor Dims from [1-5]
+ size_t TensorDimsStartIndex = 6;
+ for (size_t i = 0; i < NumDims; i++)
+ Ops.push_back(N->getOperand(TensorDimsStartIndex + i));
+
+ // Im2Col co-ordinates:
+ // These are always present in the input arguments for TensorDims{3,4,5}.
+ // Number of values is (NumDims - 2).
+ size_t Im2ColStartIndex = TensorDimsStartIndex + NumDims;
+ size_t NumDimsIm2Col = (NumDims > 2) ? (NumDims - 2) : 0;
+ size_t Im2ColEndIndex = Im2ColStartIndex + NumDimsIm2Col;
+ // ...However, passed down to the actual NVPTX only when
+ // this mode is enabled.
+ if (IsIm2Col) {
+ for (size_t i = 0; i < NumDimsIm2Col; i++)
+ Ops.push_back(N->getOperand(Im2ColStartIndex + i));
+ }
+
+ // Push MultiCast operand, if available
+ if (IsMultiCast)
+ Ops.push_back(N->getOperand(Im2ColEndIndex));
+
+ // Push CacheHint operand, if available
+ if (IsCacheHint)
+ Ops.push_back(N->getOperand(Im2ColEndIndex + 1));
+
+ // Finally, the chain operand
+ Ops.push_back(N->getOperand(0));
+
+ unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
+ NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
+
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
+bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
+ unsigned IID = N->getConstantOperandVal(1);
+ switch (IID) {
+ default:
+ return false;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_1d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_2d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_3d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_4d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_5d:
+ SelectCpAsyncBulkTensorS2G(N);
+ return true;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_1d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_2d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_3d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_4d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_5d:
+ SelectCpAsyncBulkTensorG2S(N);
+ return true;
+ }
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 49626d4051485..848cebd86a384 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -57,6 +57,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
void Select(SDNode *N) override;
bool tryIntrinsicNoChain(SDNode *N);
bool tryIntrinsicChain(SDNode *N);
+ bool tryIntrinsicVoid(SDNode *N);
void SelectTexSurfHandle(SDNode *N);
bool tryLoad(SDNode *N);
bool tryLoadVector(SDNode *N);
@@ -76,6 +77,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
+ void SelectCpAsyncBulkTensorS2G(SDNode *N);
+ void SelectCpAsyncBulkTensorG2S(SDNode *N);
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c81dfa68e4bd4..c38997ec7e78d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -433,6 +433,127 @@ def CP_ASYNC_BULK_WAIT_GROUP_READ :
[(int_nvvm_cp_async_bulk_wait_group_read (i32 timm:$n))]>,
Requires<[hasPTX<80>, hasSM<90>]>;
+//-----------------------------------
+// TMA Async Tensor Copy Functions
+//-----------------------------------
+
+// From Shared to Global memory
+class S2G_STRINGS<int dim, string mode, bit ch> {
+ string prefix = "cp.async.bulk.tensor";
+ string dir = "global.shared::cta";
+ string completion = "bulk_group";
+ string name = prefix
+ # "." # dim # "d"
+ # "." # dir
+ # "." # mode
+ # "." # completion
+ # !if(ch, ".L2::cache_hint", "");
+}
+
+multiclass CP_ASYNC_BULK_TENSOR_S2G<int dim, NVPTXRegClass rc, string mode = "tile"> {
+ defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i));
+ defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
+ defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]";
+
+ def "": NVPTXInst<(outs),
+ !con((ins rc:$src, Int64Regs:$tmap), dims_dag),
+ !strconcat(S2G_STRINGS<dim, mode, 0>.name, asm_str, ";"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+ def _CH: NVPTXInst<(outs),
+ !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch)),
+ !strconcat(S2G_STRINGS<dim, mode, 1>.name, asm_str, ", $ch;"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+}
+// Tile mode
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_1D_TILE : CP_ASYNC_BULK_TENSOR_S2G<1, Int64Regs>;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_2D_TILE : CP_ASYNC_BULK_TENSOR_S2G<2, Int64Regs>;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D_TILE : CP_ASYNC_BULK_TENSOR_S2G<3, Int64Regs>;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_4D_TILE : CP_ASYNC_BULK_TENSOR_S2G<4, Int64Regs>;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_5D_TILE : CP_ASYNC_BULK_TENSOR_S2G<5, Int64Regs>;
+
+// Tile mode with 32-bit shared-memory pointers
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_1D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_S2G<1, Int32Regs>;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_2D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_S2G<2, Int32Regs>;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_S2G<3, Int32Regs>;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_4D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_S2G<4, Int32Regs>;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_5D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_S2G<5, Int32Regs>;
+
+// Im2Col mode for 3/4/5d variants
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D_IM2COL : CP_ASYNC_BULK_TENSOR_S2G<3, Int64Regs, "im2col_no_offs">;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_4D_IM2COL : CP_ASYNC_BULK_TENSOR_S2G<4, Int64Regs, "im2col_no_offs">;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_5D_IM2COL : CP_ASYNC_BULK_TENSOR_S2G<5, Int64Regs, "im2col_no_offs">;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_3D_SHARED32_IM2COL : CP_ASYNC_BULK_TENSOR_S2G<3, Int32Regs, "im2col_no_offs">;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_4D_SHARED32_IM2COL : CP_ASYNC_BULK_TENSOR_S2G<4, Int32Regs, "im2col_no_offs">;
+defm CP_ASYNC_BULK_TENSOR_SMEM_TO_GMEM_5D_SHARED32_IM2COL : CP_ASYNC_BULK_TENSOR_S2G<5, Int32Regs, "im2col_no_offs">;
+
+// From Global to Shared memory
+class G2S_STRINGS<int dim, string mode, bit mc, bit ch> {
+ string prefix = "cp.async.bulk.tensor";
+ string dir = "shared::cluster.global";
+ string completion = "mbarrier::complete_tx::bytes";
+ string name = prefix
+ # "." # dim # "d"
+ # "." # dir
+ # "." # mode
+ # "." # completion
+ # !if(mc, ".multicast::cluster", "")
+ # !if(ch, ".L2::cache_hint", "");
+}
+
+multiclass CP_ASYNC_BULK_TENSOR_G2S<int dim, NVPTXRegClass rc, string mode = "tile"> {
+ defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i));
+ defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
+ defvar asm_str_default = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
+
+ defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
+ defvar im2col_dag = !if(!eq(mode, "im2col"),
+ !dag(ins, !listsplat(Int16Regs, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)),
+ (ins));
+ defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", ");
+ defvar im2col_asm_str = ", {{" # im2col_str # "}}";
+
+ defvar asm_str = !if(!eq(mode, "im2col"),
+ !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
+
+ def "": NVPTXInst<(outs),
+ !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag),
+ !strconcat(G2S_STRINGS<dim, mode, 0, 0>.name, asm_str, ";"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+ def _MC: NVPTXInst<(outs),
+ !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc)),
+ !strconcat(G2S_STRINGS<dim, mode, 1, 0>.name, asm_str, ", $mc;"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+ def _CH: NVPTXInst<(outs),
+ !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)),
+ !strconcat(G2S_STRINGS<dim, mode, 0, 1>.name, asm_str, ", $ch;"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+ def _MC_CH: NVPTXInst<(outs),
+ !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc, Int64Regs:$ch)),
+ !strconcat(G2S_STRINGS<dim, mode, 1, 1>.name, asm_str, ", $mc, $ch;"), []>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+}
+// Tile mode
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_1D_TILE : CP_ASYNC_BULK_TENSOR_G2S<1, Int64Regs>;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_2D_TILE : CP_ASYNC_BULK_TENSOR_G2S<2, Int64Regs>;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_3D_TILE : CP_ASYNC_BULK_TENSOR_G2S<3, Int64Regs>;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_4D_TILE : CP_ASYNC_BULK_TENSOR_G2S<4, Int64Regs>;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_5D_TILE : CP_ASYNC_BULK_TENSOR_G2S<5, Int64Regs>;
+
+// Tile mode with 32-bit shared-memory pointers
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_1D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_G2S<1, Int32Regs>;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_2D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_G2S<2, Int32Regs>;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_3D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_G2S<3, Int32Regs>;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_4D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_G2S<4, Int32Regs>;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_5D_SHARED32_TILE : CP_ASYNC_BULK_TENSOR_G2S<5, Int32Regs>;
+
+// Im2Col mode for 3/4/5d variants
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_3D_IM2COL : CP_ASYNC_BULK_TENSOR_G2S<3, Int64Regs, "im2col">;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_4D_IM2COL : CP_ASYNC_BULK_TENSOR_G2S<4, Int64Regs, "im2col">;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_5D_IM2COL : CP_ASYNC_BULK_TENSOR_G2S<5, Int64Regs, "im2col">;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_3D_SHARED32_IM2COL : CP_ASYNC_BULK_TENSOR_G2S<3, Int32Regs, "im2col">;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_4D_SHARED32_IM2COL : CP_ASYNC_BULK_TENSOR_G2S<4, Int32Regs, "im2col">;
+defm CP_ASYNC_BULK_TENSOR_GMEM_TO_SMEM_5D_SHARED32_IM2COL : CP_ASYNC_BULK_TENSOR_G2S<5, Int32Regs, "im2col">;
+
//-----------------------------------
// MBarrier Functions
//-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
new file mode 100644
index 0000000000000..caa8d35df6d06
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll
@@ -0,0 +1,431 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march nvptx64 --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK_PTX_SHARED32 %s
+; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.1d(i32 %flags, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch);
+declare void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.2d(i32 %flags, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch);
+declare void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d(i32 %flags, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch);
+declare void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.4d(i32 %flags, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch);
+declare void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.5d(i32 %flags, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch);
+
+; CHECK-LABEL: cp_async_bulk_tensor_g2s_1d
+define void @cp_async_bulk_tensor_g2s_1d(i32 %flag, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_g2s_1d(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_1d_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_1d_param_2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_1d_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_1d_param_4];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_1d_param_6];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
+; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_1d_param_5];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1;
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_g2s_1d(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<4>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_1d_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_1d_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_1d_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_1d_param_4];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_1d_param_6];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2;
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_1d_param_5];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1;
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.1d(i32 0, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 undef)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.1d(i32 1, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.1d(i32 2, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.1d(i32 3, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch)
+ ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_g2s_2d
+define void @cp_async_bulk_tensor_g2s_2d(i32 %flag, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_g2s_2d(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX64-NEXT: .reg .b32 %r<3>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_2d_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_2d_param_2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_2d_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_2d_param_4];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_2d_param_5];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_2d_param_7];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
+; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_2d_param_6];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1;
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_g2s_2d(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<5>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_2d_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_2d_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_2d_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_2d_param_4];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_2d_param_5];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_2d_param_7];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_2d_param_6];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1;
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.2d(i32 0, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 undef)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.2d(i32 1, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.2d(i32 2, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.2d(i32 3, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch)
+ ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_g2s_3d_tile
+define void @cp_async_bulk_tensor_g2s_3d_tile(i32 %flag, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_g2s_3d_tile(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX64-NEXT: .reg .b32 %r<4>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_3d_tile_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_3d_tile_param_2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_3d_tile_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_3d_tile_param_4];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_3d_tile_param_5];
+; CHECK_PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_3d_tile_param_6];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_3d_tile_param_8];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
+; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_3d_tile_param_7];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1;
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_g2s_3d_tile(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<6>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_3d_tile_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_3d_tile_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_3d_tile_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_3d_tile_param_4];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_3d_tile_param_5];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_3d_tile_param_6];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_3d_tile_param_8];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_3d_tile_param_7];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1;
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d(i32 0, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i16 undef, i64 undef)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d(i32 1, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i16 undef, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d(i32 2, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i16 %mc, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d(i32 3, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i16 %mc, i64 %ch)
+ ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_g2s_3d_im2col
+define void @cp_async_bulk_tensor_g2s_3d_im2col(i32 %flag, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_g2s_3d_im2col(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b16 %rs<3>;
+; CHECK_PTX64-NEXT: .reg .b32 %r<4>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_3d_im2col_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_3d_im2col_param_2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_3d_im2col_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_3d_im2col_param_4];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_3d_im2col_param_5];
+; CHECK_PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_3d_im2col_param_6];
+; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_3d_im2col_param_7];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
+; CHECK_PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_3d_im2col_param_9];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
+; CHECK_PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_3d_im2col_param_8];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2;
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_g2s_3d_im2col(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b16 %rs<3>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<6>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_3d_im2col_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_3d_im2col_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_3d_im2col_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_3d_im2col_param_4];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_3d_im2col_param_5];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_3d_im2col_param_6];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_3d_im2col_param_7];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_3d_im2col_param_9];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_3d_im2col_param_8];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2;
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d(i32 4, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 undef)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d(i32 5, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d(i32 6, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.3d(i32 7, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch)
+ ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_g2s_4d_tile
+define void @cp_async_bulk_tensor_g2s_4d_tile(i32 %flag, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_g2s_4d_tile(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX64-NEXT: .reg .b32 %r<5>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_4d_tile_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_4d_tile_param_2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_4d_tile_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_4d_tile_param_4];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_4d_tile_param_5];
+; CHECK_PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_4d_tile_param_6];
+; CHECK_PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_4d_tile_param_7];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_4d_tile_param_9];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
+; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_4d_tile_param_8];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1;
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_g2s_4d_tile(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<7>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_4d_tile_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_4d_tile_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_4d_tile_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_4d_tile_param_4];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_4d_tile_param_5];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_4d_tile_param_6];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_4d_tile_param_7];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_4d_tile_param_9];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_4d_tile_param_8];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1;
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.4d(i32 0, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i16 undef, i16 undef, i64 undef)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.4d(i32 1, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i16 undef, i16 undef, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.4d(i32 2, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i16 undef, i16 %mc, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.4d(i32 3, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i16 undef, i16 %mc, i64 %ch)
+ ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_g2s_4d_im2col
+define void @cp_async_bulk_tensor_g2s_4d_im2col(i32 %flag, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_g2s_4d_im2col(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b16 %rs<4>;
+; CHECK_PTX64-NEXT: .reg .b32 %r<5>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_4d_im2col_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_4d_im2col_param_2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_4d_im2col_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_4d_im2col_param_4];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_4d_im2col_param_5];
+; CHECK_PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_4d_im2col_param_6];
+; CHECK_PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_4d_im2col_param_7];
+; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_4d_im2col_param_8];
+; CHECK_PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_4d_im2col_param_9];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
+; CHECK_PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_4d_im2col_param_11];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
+; CHECK_PTX64-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_4d_im2col_param_10];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_g2s_4d_im2col(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b16 %rs<4>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<7>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_4d_im2col_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_4d_im2col_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_4d_im2col_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_4d_im2col_param_4];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_4d_im2col_param_5];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_4d_im2col_param_6];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_4d_im2col_param_7];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_4d_im2col_param_8];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_4d_im2col_param_9];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_4d_im2col_param_11];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_4d_im2col_param_10];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.4d(i32 4, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 undef)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.4d(i32 5, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.4d(i32 6, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.4d(i32 7, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch)
+ ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_g2s_5d_tile
+define void @cp_async_bulk_tensor_g2s_5d_tile(i32 %flag, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_g2s_5d_tile(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX64-NEXT: .reg .b32 %r<6>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_5d_tile_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_5d_tile_param_2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_5d_tile_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_5d_tile_param_4];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_5d_tile_param_5];
+; CHECK_PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_5d_tile_param_6];
+; CHECK_PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_5d_tile_param_7];
+; CHECK_PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_5d_tile_param_8];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_5d_tile_param_10];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
+; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_5d_tile_param_9];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_g2s_5d_tile(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b16 %rs<2>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<8>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_5d_tile_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_5d_tile_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_5d_tile_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_5d_tile_param_4];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_5d_tile_param_5];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_5d_tile_param_6];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_5d_tile_param_7];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_5d_tile_param_8];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_5d_tile_param_10];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_5d_tile_param_9];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.5d(i32 0, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 undef, i16 undef, i16 undef, i16 undef, i64 undef)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.5d(i32 1, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 undef, i16 undef, i16 undef, i16 undef, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.5d(i32 2, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 undef, i16 undef, i16 undef, i16 %mc, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.5d(i32 3, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 undef, i16 undef, i16 undef, i16 %mc, i64 %ch)
+ ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_g2s_5d_im2col
+define void @cp_async_bulk_tensor_g2s_5d_im2col(i32 %flag, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_g2s_5d_im2col(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b16 %rs<5>;
+; CHECK_PTX64-NEXT: .reg .b32 %r<6>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<5>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_5d_im2col_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_5d_im2col_param_2];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_5d_im2col_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_5d_im2col_param_4];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_5d_im2col_param_5];
+; CHECK_PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_5d_im2col_param_6];
+; CHECK_PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_5d_im2col_param_7];
+; CHECK_PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_5d_im2col_param_8];
+; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_5d_im2col_param_9];
+; CHECK_PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_5d_im2col_param_10];
+; CHECK_PTX64-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_5d_im2col_param_11];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
+; CHECK_PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_5d_im2col_param_13];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
+; CHECK_PTX64-NEXT: ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_5d_im2col_param_12];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_g2s_5d_im2col(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b16 %rs<5>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<8>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_5d_im2col_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_5d_im2col_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_5d_im2col_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_5d_im2col_param_4];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_5d_im2col_param_5];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_5d_im2col_param_6];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_5d_im2col_param_7];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_5d_im2col_param_8];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_5d_im2col_param_9];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_5d_im2col_param_10];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_5d_im2col_param_11];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_5d_im2col_param_13];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
+; CHECK_PTX_SHARED32-NEXT: ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_5d_im2col_param_12];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4;
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.5d(i32 4, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 undef)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.5d(i32 5, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.5d(i32 6, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch)
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.gmem.to.smem.5d(i32 7, ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch)
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll
new file mode 100644
index 0000000000000..a8d48e23b7741
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX_SHARED32 %s
+; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr | %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.1d(i32 %flags, ptr addrspace(3) %s, ptr %tm, i32 %d0, i64 %ch);
+declare void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.2d(i32 %flags, ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i64 %ch);
+declare void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.3d(i32 %flags, ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch);
+declare void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.4d(i32 %flags, ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch);
+declare void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.5d(i32 %flags, ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch);
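+
+; In these tests, bit 0 of the i32 flag operand enables the L2::cache_hint
+; variant and bit 2 selects the im2col_no_offs mode, so the calls below use
+; flag values 0/1 (tile) and 4/5 (im2col_no_offs).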
+
+; CHECK-LABEL: cp_async_bulk_tensor_s2g_1d
+define void @cp_async_bulk_tensor_s2g_1d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_s2g_1d(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_1d_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_1d_param_2];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_1d_param_3];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd2, {%r1}], [%rd1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_1d_param_4];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_s2g_1d(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<3>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_1d_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_1d_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_1d_param_3];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd1, {%r2}], [%r1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_1d_param_4];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2}], [%r1], %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.1d(i32 0, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef)
+
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.1d(i32 1, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch)
+ ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_s2g_2d
+define void @cp_async_bulk_tensor_s2g_2d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_s2g_2d(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b32 %r<3>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_2d_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_2d_param_2];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_2d_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_2d_param_4];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_2d_param_5];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_s2g_2d(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<4>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_2d_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_2d_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_2d_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_2d_param_4];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3}], [%r1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_2d_param_5];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3}], [%r1], %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.2d(i32 0, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef)
+
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.2d(i32 1, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch)
+ ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_s2g_3d
+define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_s2g_3d(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b32 %r<4>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_2];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_3d_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_3d_param_4];
+; CHECK_PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_3d_param_5];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_3d_param_6];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_s2g_3d(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<5>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_3d_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_3d_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_3d_param_4];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_3d_param_5];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_6];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2;
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.3d(i32 0, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef)
+
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.3d(i32 1, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch)
+
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.3d(i32 4, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef)
+
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.3d(i32 5, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch)
+ ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_s2g_4d
+define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_s2g_4d(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b32 %r<5>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_2];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_4d_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_4d_param_4];
+; CHECK_PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_4d_param_5];
+; CHECK_PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_4d_param_6];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_4d_param_7];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_s2g_4d(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<6>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_4d_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_4d_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_4d_param_4];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_4d_param_5];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_4d_param_6];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_7];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2;
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.4d(i32 0, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef)
+
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.4d(i32 1, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch)
+
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.4d(i32 4, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef)
+
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.4d(i32 5, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch)
+ ret void
+}
+
+; CHECK-LABEL: cp_async_bulk_tensor_s2g_5d
+define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) {
+; CHECK_PTX64-LABEL: cp_async_bulk_tensor_s2g_5d(
+; CHECK_PTX64: {
+; CHECK_PTX64-NEXT: .reg .b32 %r<6>;
+; CHECK_PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK_PTX64-EMPTY:
+; CHECK_PTX64-NEXT: // %bb.0:
+; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_2];
+; CHECK_PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_5d_param_3];
+; CHECK_PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_5d_param_4];
+; CHECK_PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_5d_param_5];
+; CHECK_PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_5d_param_6];
+; CHECK_PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_5d_param_7];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK_PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_5d_param_8];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
+; CHECK_PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
+; CHECK_PTX64-NEXT: ret;
+;
+; CHECK_PTX_SHARED32-LABEL: cp_async_bulk_tensor_s2g_5d(
+; CHECK_PTX_SHARED32: {
+; CHECK_PTX_SHARED32-NEXT: .reg .b32 %r<7>;
+; CHECK_PTX_SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK_PTX_SHARED32-EMPTY:
+; CHECK_PTX_SHARED32-NEXT: // %bb.0:
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_5d_param_1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_2];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_5d_param_3];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_5d_param_4];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_5d_param_5];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_5d_param_6];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_s2g_5d_param_7];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
+; CHECK_PTX_SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_8];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2;
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
+; CHECK_PTX_SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2;
+; CHECK_PTX_SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.5d(i32 0, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef)
+
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.5d(i32 1, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch)
+
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.5d(i32 4, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef)
+
+ tail call void @llvm.nvvm.cp.async.bulk.tensor.smem.to.gmem.5d(i32 5, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch)
+ ret void
+}