[Mlir-commits] [mlir] [MLIR][NVVM] Add support tcgen05.{ld, st} (PR #130728)
Pradeep Kumar
llvmlistbot at llvm.org
Tue Mar 11 00:00:22 PDT 2025
https://github.com/schwarzschild-radius created https://github.com/llvm/llvm-project/pull/130728
This commit adds support for tcgen05.{ld, st} to the NVVM Dialect with tests under tcgen05-ld.mlir and tcgen05-st.mlir respectively
>From fd4a524e2ddede48993a422177be01e5a9cdca1c Mon Sep 17 00:00:00 2001
From: pradeepku <pradeepku at nvidia.com>
Date: Fri, 7 Mar 2025 19:18:25 +0530
Subject: [PATCH] [MLIR][NVVM] Add support tcgen05.{ld, st}
This commit adds support for tcgen05.{ld, st} to the NVVM Dialect with tests
under tcgen05-ld.mlir and tcgen05-st.mlir respectively
---
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 202 ++++++++++
mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 46 +++
.../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 106 +++++
mlir/test/Target/LLVMIR/nvvm/tcgen05-ld.mlir | 287 +++++++++++++
mlir/test/Target/LLVMIR/nvvm/tcgen05-st.mlir | 377 ++++++++++++++++++
5 files changed, 1018 insertions(+)
create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-ld.mlir
create mode 100644 mlir/test/Target/LLVMIR/nvvm/tcgen05-st.mlir
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 944cb481b025b..ff6696f6bec40 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -2929,6 +2929,208 @@ def NVVM_Tcgen05CpOp : NVVM_Op<"tcgen05.cp"> {
}];
}
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05 LdSt Shape Attr
+//===----------------------------------------------------------------------===//
+
+def Tcgen05LdStShape16x64b: I32EnumAttrCase<"SHAPE_16X64B", 0, "shape_16x64b">;
+def Tcgen05LdStShape16x128b: I32EnumAttrCase<"SHAPE_16X128B", 1, "shape_16x128b">;
+def Tcgen05LdStShape16x256b: I32EnumAttrCase<"SHAPE_16X256B", 2, "shape_16x256b">;
+def Tcgen05LdStShape32x32b: I32EnumAttrCase<"SHAPE_32X32B", 3, "shape_32x32b">;
+def Tcgen05LdStShape16x32bx2: I32EnumAttrCase<"SHAPE_16X32BX2", 4, "shape_16x32bx2">;
+
+def Tcgen05LdStShape: I32EnumAttr<
+ "Tcgen05LdStShape",
+ "",
+ [Tcgen05LdStShape16x64b, Tcgen05LdStShape16x128b, Tcgen05LdStShape16x256b,
+ Tcgen05LdStShape32x32b, Tcgen05LdStShape16x32bx2]
+> {
+ let cppNamespace = "::mlir::NVVM";
+ let genSpecializedAttr = 0;
+}
+
+def Tcgen05LdStShapeAttr: EnumAttr<NVVM_Dialect, Tcgen05LdStShape, "tcgen05_ldst_shape"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.ld Op
+//===----------------------------------------------------------------------===//
+
+def NVVM_Tcgen05LdOp : NVVM_Op<"tcgen05.ld"> {
+ let summary = "tensor memory load instructions";
+ let arguments = (ins
+ // Attributes
+ UnitAttr:$pack,
+ Tcgen05LdStShapeAttr:$shape,
+ // Arguments
+ LLVM_PointerTensor:$tmemAddr,
+ Optional<I64>:$offset
+ );
+
+ let results = (outs AnyTypeOf<[I32, VectorOfLengthAndType<
+ [2, 4, 8, 16, 32, 64, 128], [I32]>]>:$res);
+
+ let assemblyFormat = [{
+ $tmemAddr (`,` $offset^)? (`pack` $pack^)? attr-dict `:` type($res)
+ }];
+
+ let description = [{
+ Instruction `tcgen05.ld` asynchronously loads data from the Tensor Memory at
+ the location specified by the 32-bit address operand `tmemAddr` into the
+ destination register `res`, collectively across all threads of the warps.
+
+  The `shape` and the `num` attributes together determine the total
+ dimension of the data which is loaded from the Tensor Memory. The `shape`
+ attribute indicates the base dimension of data to be accessed as described
+ in the Data Movement Shape. The `num` attribute indicates the repeat
+ factor on the base dimension resulting in the total dimension of the data
+ that is accessed.
+
+ The shape `16x32bx2` performs two accesses into Tensor Memory of the shape
+ `16x32b`. The base address of the first access is specified by `tmemAddr`
+ and the base address of the second access is specified by
+ `tmemAddr + offset`, where `offset` is an immediate argument.
+
+ The unit attribute `pack` can be used to pack two 16-bit
+ elements from adjacent columns into a single 32-bit element during the load.
+
+ The following table describes the size of the vector for various combinations
+ of `num` and `shape` attributes
+ |=====================================================================|
+ | num/shape | 16x32bx2/16x64b/32x32b | 16x128b | 16x256b |
+ |=====================================================================|
+ | x1 | 1 | 2 | 4 |
+ | x2 | 2 | 4 | 8 |
+ | x4 | 4 | 8 | 16 |
+ | x8 | 8 | 16 | 32 |
+ | x16 | 16 | 32 | 64 |
+ | x32 | 32 | 64 | 128 |
+ | x64 | 64 | 128 | NA |
+ | x128 | 128 | NA | NA |
+ |=====================================================================|
+
+ Example:
+ ```mlir
+    nvvm.tcgen05.ld %tmemAddr, %offset pack {
+      shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>
+    } : vector<2xi32>
+ ```
+
+  [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld)
+ }];
+
+ let hasVerifier = 1;
+
+ string llvmBuilder = [{
+ llvm::LLVMContext &Context = moduleTranslation.getLLVMContext();
+ auto Pack = llvm::ConstantInt::get(Context, llvm::APInt(1, $pack));
+
+ unsigned num = $_resultType->isVectorTy()
+ ? llvm::cast<llvm::VectorType>($_resultType)
+ ->getElementCount()
+ .getFixedValue()
+ : 1;
+
+ auto ID = getTcgen05LdIntrinsicID($shape, num);
+ if (ID == llvm::Intrinsic::not_intrinsic)
+      llvm::report_fatal_error("unknown intrinsic signature for tcgen05.ld");
+
+ if ($offset)
+ $res = createIntrinsicCall(builder, ID, {$tmemAddr, $offset, Pack});
+ else
+ $res = createIntrinsicCall(builder, ID, {$tmemAddr, Pack});
+ }];
+}
+
+//===----------------------------------------------------------------------===//
+// NVVM tcgen05.st Op
+//===----------------------------------------------------------------------===//
+
+def NVVM_Tcgen05StOp : NVVM_Op<"tcgen05.st"> {
+ let summary = "tensor memory store instructions";
+ let arguments = (ins
+ // Attributes
+ UnitAttr:$unpack,
+ Tcgen05LdStShapeAttr:$shape,
+ // Arguments
+ LLVM_PointerTensor:$tmemAddr,
+ AnyTypeOf<[I32, VectorOfLengthAndType<
+ [2, 4, 8, 16, 32, 64, 128], [I32]>]>:$val,
+ Optional<I64>:$offset
+ );
+
+ let assemblyFormat = [{
+ $tmemAddr `,` $val (`,` $offset^)? (`unpack` $unpack^)? attr-dict `:` type($val)
+ }];
+
+ let description = [{
+    Instruction `tcgen05.st` asynchronously stores data from the source register `val`
+ into the Tensor Memory at the location specified by the 32-bit address operand
+ `tmemAddr`, collectively across all threads of the warps.
+
+  The `shape` and the `num` attributes together determine the total dimension of
+ the data which is stored to the Tensor Memory. The `shape` indicates the base
+ dimension of data to be accessed. The `num` attribute indicates the repeat
+ factor on the base dimension resulting in the total dimension of the data that
+ is accessed.
+
+ The shape `16x32bx2` performs two accesses into Tensor Memory of the shape
+ `16x32b`. The base address of the first access is specified by `tmemAddr`
+ and the base address of the second access is specified by
+ `tmemAddr + offset`, where `offset` is an immediate argument.
+
+ The unit attribute `unpack` can be used to unpack a 32-bit element
+ in the register into two 16-bit elements and store them in adjacent columns.
+
+ The following table describes the size of the vector for various combinations
+ of `num` and `shape` attributes
+ |=====================================================================|
+ | num/shape | 16x32bx2/16x64b/32x32b | 16x128b | 16x256b |
+ |=====================================================================|
+ | x1 | 1 | 2 | 4 |
+ | x2 | 2 | 4 | 8 |
+ | x4 | 4 | 8 | 16 |
+ | x8 | 8 | 16 | 32 |
+ | x16 | 16 | 32 | 64 |
+ | x32 | 32 | 64 | 128 |
+ | x64 | 64 | 128 | NA |
+ | x128 | 128 | NA | NA |
+ |=====================================================================|
+
+ Example:
+ ```mlir
+    nvvm.tcgen05.st %tmemAddr, %val, %offset unpack {
+      shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>
+    } : vector<2xi32>
+ ```
+
+ [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st)
+ }];
+
+ string llvmBuilder = [{
+ llvm::LLVMContext &Context = moduleTranslation.getLLVMContext();
+ auto Unpack = llvm::ConstantInt::get(Context, llvm::APInt(1, $unpack));
+
+ auto valTy = $val->getType();
+ uint32_t num = valTy->isVectorTy() ? llvm::cast<llvm::VectorType>(valTy)
+ ->getElementCount()
+ .getFixedValue()
+ : 1;
+
+ auto ID = getTcgen05StIntrinsicID($shape, num);
+ if (ID == llvm::Intrinsic::not_intrinsic)
+      llvm::report_fatal_error("unknown intrinsic signature for tcgen05.st");
+
+ if ($offset)
+ createIntrinsicCall(builder, ID, {$tmemAddr, $offset, $val, Unpack});
+ else
+ createIntrinsicCall(builder, ID, {$tmemAddr, $val, Unpack});
+ }];
+
+ let hasVerifier = 1;
+}
+
//===----------------------------------------------------------------------===//
// NVVM target attribute.
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index 45a0f9dbd4a7c..1d7b979a5cc90 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
@@ -1387,6 +1388,51 @@ llvm::Intrinsic::ID Tcgen05CpOp::getIntrinsicID(Operation &op) {
llvm_unreachable("Invalid shape in tcgen05 cp Op");
}
+// Returns true if the given vector length is valid for the given shape; the
+// function models the table mentioned in the tcgen05.{ld, st} Op description
+static bool isValidVectorLength(NVVM::Tcgen05LdStShape Shape,
+ unsigned VecLen) {
+ if (Shape == NVVM::Tcgen05LdStShape::SHAPE_16X128B)
+ return VecLen >= 2;
+ if (Shape == NVVM::Tcgen05LdStShape::SHAPE_16X256B)
+ return VecLen >= 4;
+ return true;
+}
+
+LogicalResult Tcgen05LdOp::verify() {
+ LogicalResult Result = success();
+ if (getShape() == NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && !getOffset())
+ Result = emitError("shape 16x32bx2 requires offset argument");
+
+ auto ResTy = getRes().getType();
+ unsigned ResLen = isa<VectorType>(ResTy)
+ ? llvm::cast<VectorType>(ResTy).getNumElements()
+ : 1;
+ if (!isValidVectorLength(getShape(), ResLen))
+ Result = emitError(llvm::formatv("invalid result type length {0} for shape "
+ "{1} in tcgen05.ld Op",
+ ResLen, stringifyEnum(getShape())));
+
+ return Result;
+}
+
+LogicalResult Tcgen05StOp::verify() {
+ LogicalResult Result = success();
+ if (getShape() == NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && !getOffset())
+ Result = emitError("shape 16x32bx2 requires offset argument");
+
+ auto ValTy = getVal().getType();
+ unsigned ValLen = isa<VectorType>(ValTy)
+ ? llvm::cast<VectorType>(ValTy).getNumElements()
+ : 1;
+ if (!isValidVectorLength(getShape(), ValLen))
+ Result = emitError(llvm::formatv("invalid input length {0} for shape "
+ "{1} in tcgen05.st Op",
+ ValLen, stringifyEnum(getShape())));
+
+ return Result;
+}
+
/// Infer the result ranges for the NVVM SpecialRangeableRegisterOp that might
/// have ConstantRangeAttr.
static void nvvmInferResultRanges(Operation *op, Value result,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
index 9540762de2777..c3a129a82688f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp
@@ -170,6 +170,112 @@ static unsigned getUnidirectionalFenceProxyID(NVVM::ProxyKind fromProxy,
llvm_unreachable("Unsupported proxy kinds");
}
+#define TCGEN05LD(SHAPE, NUM) llvm::Intrinsic::nvvm_tcgen05_ld_##SHAPE##_##NUM
+
+static llvm::Intrinsic::ID
+getTcgen05LdIntrinsicID(mlir::NVVM::Tcgen05LdStShape shape, uint32_t num) {
+ llvm::Intrinsic::ID Shape16x64b[] = {
+ TCGEN05LD(16x64b, x1), TCGEN05LD(16x64b, x2), TCGEN05LD(16x64b, x4),
+ TCGEN05LD(16x64b, x8), TCGEN05LD(16x64b, x16), TCGEN05LD(16x64b, x32),
+ TCGEN05LD(16x64b, x64), TCGEN05LD(16x64b, x128),
+ };
+
+ llvm::Intrinsic::ID Shape16x128b[] = {
+ TCGEN05LD(16x128b, x1), TCGEN05LD(16x128b, x2), TCGEN05LD(16x128b, x4),
+ TCGEN05LD(16x128b, x8), TCGEN05LD(16x128b, x16), TCGEN05LD(16x128b, x32),
+ TCGEN05LD(16x128b, x64),
+ };
+
+ llvm::Intrinsic::ID Shape16x256b[] = {
+ TCGEN05LD(16x256b, x1), TCGEN05LD(16x256b, x2), TCGEN05LD(16x256b, x4),
+ TCGEN05LD(16x256b, x8), TCGEN05LD(16x256b, x16), TCGEN05LD(16x256b, x32),
+ };
+
+ llvm::Intrinsic::ID Shape16x32bx2[] = {
+ TCGEN05LD(16x32bx2, x1), TCGEN05LD(16x32bx2, x2),
+ TCGEN05LD(16x32bx2, x4), TCGEN05LD(16x32bx2, x8),
+ TCGEN05LD(16x32bx2, x16), TCGEN05LD(16x32bx2, x32),
+ TCGEN05LD(16x32bx2, x64), TCGEN05LD(16x32bx2, x128),
+ };
+
+ llvm::Intrinsic::ID Shape32x32b[] = {
+ TCGEN05LD(32x32b, x1), TCGEN05LD(32x32b, x2), TCGEN05LD(32x32b, x4),
+ TCGEN05LD(32x32b, x8), TCGEN05LD(32x32b, x16), TCGEN05LD(32x32b, x32),
+ TCGEN05LD(32x32b, x64), TCGEN05LD(32x32b, x128),
+ };
+
+ // `num` contains the length of vector and log2 of `num` returns the index
+ // into the shape array
+ unsigned Idx = std::log2(num);
+
+ switch (shape) {
+ case NVVM::Tcgen05LdStShape::SHAPE_16X64B:
+ return Shape16x64b[Idx];
+ case NVVM::Tcgen05LdStShape::SHAPE_16X128B:
+ return Shape16x128b[Idx - 1];
+ case NVVM::Tcgen05LdStShape::SHAPE_16X256B:
+ return Shape16x256b[Idx - 2];
+ case NVVM::Tcgen05LdStShape::SHAPE_32X32B:
+ return Shape32x32b[Idx];
+ case NVVM::Tcgen05LdStShape::SHAPE_16X32BX2:
+ return Shape16x32bx2[Idx];
+ }
+ llvm_unreachable("unhandled tcgen05.ld lowering");
+}
+
+#define TCGEN05ST(SHAPE, NUM) llvm::Intrinsic::nvvm_tcgen05_st_##SHAPE##_##NUM
+
+static llvm::Intrinsic::ID
+getTcgen05StIntrinsicID(mlir::NVVM::Tcgen05LdStShape shape, uint32_t num) {
+ llvm::Intrinsic::ID Shape16x64b[] = {
+ TCGEN05ST(16x64b, x1), TCGEN05ST(16x64b, x2), TCGEN05ST(16x64b, x4),
+ TCGEN05ST(16x64b, x8), TCGEN05ST(16x64b, x16), TCGEN05ST(16x64b, x32),
+ TCGEN05ST(16x64b, x64), TCGEN05ST(16x64b, x128),
+ };
+
+ llvm::Intrinsic::ID Shape16x128b[] = {
+ TCGEN05ST(16x128b, x1), TCGEN05ST(16x128b, x2), TCGEN05ST(16x128b, x4),
+ TCGEN05ST(16x128b, x8), TCGEN05ST(16x128b, x16), TCGEN05ST(16x128b, x32),
+ TCGEN05ST(16x128b, x64),
+ };
+
+ llvm::Intrinsic::ID Shape16x256b[] = {
+ TCGEN05ST(16x256b, x1), TCGEN05ST(16x256b, x2), TCGEN05ST(16x256b, x4),
+ TCGEN05ST(16x256b, x8), TCGEN05ST(16x256b, x16), TCGEN05ST(16x256b, x32),
+ };
+
+ llvm::Intrinsic::ID Shape16x32bx2[] = {
+ TCGEN05ST(16x32bx2, x1), TCGEN05ST(16x32bx2, x2),
+ TCGEN05ST(16x32bx2, x4), TCGEN05ST(16x32bx2, x8),
+ TCGEN05ST(16x32bx2, x16), TCGEN05ST(16x32bx2, x32),
+ TCGEN05ST(16x32bx2, x64), TCGEN05ST(16x32bx2, x128),
+ };
+
+ llvm::Intrinsic::ID Shape32x32b[] = {
+ TCGEN05ST(32x32b, x1), TCGEN05ST(32x32b, x2), TCGEN05ST(32x32b, x4),
+ TCGEN05ST(32x32b, x8), TCGEN05ST(32x32b, x16), TCGEN05ST(32x32b, x32),
+ TCGEN05ST(32x32b, x64), TCGEN05ST(32x32b, x128),
+ };
+
+ // `num` contains the length of vector and log2 of `num` returns the index
+ // into the shape array
+ unsigned Idx = std::log2(num);
+
+ switch (shape) {
+ case NVVM::Tcgen05LdStShape::SHAPE_16X64B:
+ return Shape16x64b[Idx];
+ case NVVM::Tcgen05LdStShape::SHAPE_16X128B:
+ return Shape16x128b[Idx - 1];
+ case NVVM::Tcgen05LdStShape::SHAPE_16X256B:
+ return Shape16x256b[Idx - 2];
+ case NVVM::Tcgen05LdStShape::SHAPE_32X32B:
+ return Shape32x32b[Idx];
+ case NVVM::Tcgen05LdStShape::SHAPE_16X32BX2:
+ return Shape16x32bx2[Idx];
+ }
+ llvm_unreachable("unhandled tcgen05.st lowering");
+}
+
namespace {
/// Implementation of the dialect interface that converts operations belonging
/// to the NVVM dialect to LLVM IR.
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld.mlir
new file mode 100644
index 0000000000000..b1266b0e8151d
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld.mlir
@@ -0,0 +1,287 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x64b
+llvm.func @nvvm_tcgen05_ld_16x64b(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.16x64b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv1 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv2 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv4 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv8 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv16= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv32= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv64= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x128(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv128= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<128 x i32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x64b_pack
+llvm.func @nvvm_tcgen05_ld_16x64b_pack(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.16x64b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv1 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv2 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv4 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv8 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv16= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv32= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv64= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x128(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv128= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>} : vector<128 x i32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x128b
+llvm.func @nvvm_tcgen05_ld_16x128b(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv2 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv4 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv8 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv16= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv32= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv64= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv128= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<128 x i32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x128b_pack
+llvm.func @nvvm_tcgen05_ld_16x128b_pack(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv2 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv4 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv8 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv16= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv32= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv64= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv128= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>} : vector<128 x i32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x256b
+llvm.func @nvvm_tcgen05_ld_16x256b(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv4 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv8 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv16= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv32= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv64= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv128= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<128 x i32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x256b_pack
+llvm.func @nvvm_tcgen05_ld_16x256b_pack(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv4 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv8 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv16= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv32= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv64= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv128= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>} : vector<128 x i32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_32x32b
+llvm.func @nvvm_tcgen05_ld_32x32b(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.32x32b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv1 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv2 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv4 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv8 = nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv16= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv32= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv64= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x128(ptr addrspace(6) {{%[0-9]+}}, i1 false)
+ %ldv128= nvvm.tcgen05.ld %tmemAddr { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<128 x i32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_32x32b_pack
+llvm.func @nvvm_tcgen05_ld_32x32b_pack(%tmemAddr : !llvm.ptr<6>) {
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.32x32b.x1(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv1 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x2(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv2 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x4(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv4 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x8(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv8 = nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x16(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv16= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x32(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv32= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x64(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv64= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.32x32b.x128(ptr addrspace(6) {{%[0-9]+}}, i1 true)
+ %ldv128= nvvm.tcgen05.ld %tmemAddr pack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<128 x i32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x32bx2
+llvm.func @nvvm_tcgen05_ld_16x32bx2(%tmemAddr : !llvm.ptr<6>) {
+
+ %halfSplitOffset = llvm.mlir.constant(2:i64) : i64
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.16x32bx2.x1(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+ %ldv1 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x2(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+ %ldv2 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x4(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+ %ldv4 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x8(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+ %ldv8 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x16(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+ %ldv16= nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x32(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+ %ldv32= nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x64(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+ %ldv64= nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x128(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 false)
+ %ldv128= nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<128 x i32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_ld_16x32bx2_pack
+llvm.func @nvvm_tcgen05_ld_16x32bx2_pack(%tmemAddr : !llvm.ptr<6>) {
+
+ %halfSplitOffset = llvm.mlir.constant(2:i64) : i64
+
+// CHECK: call i32 @llvm.nvvm.tcgen05.ld.16x32bx2.x1(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+ %ldv1 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : i32
+
+// CHECK: call <2 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x2(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+ %ldv2 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<2 x i32>
+
+// CHECK: call <4 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x4(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+ %ldv4 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<4 x i32>
+
+// CHECK: call <8 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x8(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+ %ldv8 = nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<8 x i32>
+
+// CHECK: call <16 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x16(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+ %ldv16= nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<16 x i32>
+
+// CHECK: call <32 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x32(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+ %ldv32= nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<32 x i32>
+
+// CHECK: call <64 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x64(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+ %ldv64= nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<64 x i32>
+
+// CHECK: call <128 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x128(ptr addrspace(6) {{%[0-9]+}}, i64 2, i1 true)
+ %ldv128= nvvm.tcgen05.ld %tmemAddr, %halfSplitOffset pack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>} : vector<128 x i32>
+
+ llvm.return
+}
diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-st.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-st.mlir
new file mode 100644
index 0000000000000..119746133625d
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-st.mlir
@@ -0,0 +1,377 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x64b
+llvm.func @nvvm_tcgen05_st_16x64b(
+ %tmemAddr : !llvm.ptr<6>,
+ %stv1 : i32,
+ %stv2 : vector<2xi32>,
+ %stv4 : vector<4xi32>,
+ %stv8 : vector<8xi32>,
+ %stv16 : vector<16xi32>,
+ %stv32 : vector<32xi32>,
+ %stv64 : vector<64xi32>,
+ %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x1(ptr addrspace(6) {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv1 { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=1:i32 } : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x2(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv2 { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=2:i32 } : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x4(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv4 { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=4:i32 } : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x8(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv8 { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=8:i32 } : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x16(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv16 { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=16:i32 } : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x32(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv32 { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=32:i32 } : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x64(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv64 { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=64:i32 } : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x128(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv128 { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=128:i32 } : vector<128xi32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x64b_pack
+llvm.func @nvvm_tcgen05_st_16x64b_pack(
+ %tmemAddr : !llvm.ptr<6>,
+ %stv1 : i32,
+ %stv2 : vector<2xi32>,
+ %stv4 : vector<4xi32>,
+ %stv8 : vector<8xi32>,
+ %stv16 : vector<16xi32>,
+ %stv32 : vector<32xi32>,
+ %stv64 : vector<64xi32>,
+ %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x1(ptr addrspace(6) {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv1 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=1:i32 } : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x2(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv2 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=2:i32 } : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x4(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv4 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=4:i32 } : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x8(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv8 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=8:i32 } : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x16(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv16 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=16:i32 } : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x32(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv32 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=32:i32 } : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x64(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv64 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=64:i32 } : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x64b.x128(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv128 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x64b>, num=128:i32 } : vector<128xi32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x128b
+llvm.func @nvvm_tcgen05_st_16x128b(
+ %tmemAddr : !llvm.ptr<6>,
+ %stv1 : i32,
+ %stv2 : vector<2xi32>,
+ %stv4 : vector<4xi32>,
+ %stv8 : vector<8xi32>,
+ %stv16 : vector<16xi32>,
+ %stv32 : vector<32xi32>,
+ %stv64 : vector<64xi32>,
+ %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x1(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv2 { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=1:i32 } : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x2(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv4 { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=2:i32 } : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x4(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv8 { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=4:i32 } : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x8(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv16 { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=8:i32 } : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x16(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv32 { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=16:i32 } : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x32(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv64 { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=32:i32 } : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x64(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv128 { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=64:i32 } : vector<128xi32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x128b_pack
+llvm.func @nvvm_tcgen05_st_16x128b_pack(
+ %tmemAddr : !llvm.ptr<6>,
+ %stv1 : i32,
+ %stv2 : vector<2xi32>,
+ %stv4 : vector<4xi32>,
+ %stv8 : vector<8xi32>,
+ %stv16 : vector<16xi32>,
+ %stv32 : vector<32xi32>,
+ %stv64 : vector<64xi32>,
+ %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x1(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv2 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=1:i32 } : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x2(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv4 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=2:i32 } : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x4(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv8 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=4:i32 } : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x8(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv16 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=8:i32 } : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x16(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv32 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=16:i32 } : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x32(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv64 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=32:i32 } : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x128b.x64(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv128 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x128b>, num=64:i32 } : vector<128xi32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x256b
+llvm.func @nvvm_tcgen05_st_16x256b(
+ %tmemAddr : !llvm.ptr<6>,
+ %stv1 : i32,
+ %stv2 : vector<2xi32>,
+ %stv4 : vector<4xi32>,
+ %stv8 : vector<8xi32>,
+ %stv16 : vector<16xi32>,
+ %stv32 : vector<32xi32>,
+ %stv64 : vector<64xi32>,
+ %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x1(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv4 { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=1:i32 } : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x2(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv8 { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=2:i32 } : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x4(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv16 { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=4:i32 } : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x8(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv32 { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=8:i32 } : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x16(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv64 { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=16:i32 } : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x32(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv128 { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=32:i32 } : vector<128xi32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x256b_pack
+llvm.func @nvvm_tcgen05_st_16x256b_pack(
+ %tmemAddr : !llvm.ptr<6>,
+ %stv1 : i32,
+ %stv2 : vector<2xi32>,
+ %stv4 : vector<4xi32>,
+ %stv8 : vector<8xi32>,
+ %stv16 : vector<16xi32>,
+ %stv32 : vector<32xi32>,
+ %stv64 : vector<64xi32>,
+ %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x1(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv4 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=1:i32 } : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x2(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv8 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=2:i32 } : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x4(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv16 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=4:i32 } : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x8(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv32 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=8:i32 } : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x16(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv64 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=16:i32 } : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x256b.x32(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv128 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x256b>, num=32:i32 } : vector<128xi32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_32x32b
+llvm.func @nvvm_tcgen05_st_32x32b(
+ %tmemAddr : !llvm.ptr<6>,
+ %stv1 : i32,
+ %stv2 : vector<2xi32>,
+ %stv4 : vector<4xi32>,
+ %stv8 : vector<8xi32>,
+ %stv16 : vector<16xi32>,
+ %stv32 : vector<32xi32>,
+ %stv64 : vector<64xi32>,
+ %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x1(ptr addrspace(6) {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv1 { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=1:i32 } : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x2(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv2 { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=2:i32 } : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x4(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv4 { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=4:i32 } : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x8(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv8 { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=8:i32 } : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x16(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv16 { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=16:i32 } : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x32(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv32 { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=32:i32 } : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x64(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv64 { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=64:i32 } : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x128(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv128 { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=128:i32 } : vector<128xi32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_32x32b_pack
+llvm.func @nvvm_tcgen05_st_32x32b_pack(
+ %tmemAddr : !llvm.ptr<6>,
+ %stv1 : i32,
+ %stv2 : vector<2xi32>,
+ %stv4 : vector<4xi32>,
+ %stv8 : vector<8xi32>,
+ %stv16 : vector<16xi32>,
+ %stv32 : vector<32xi32>,
+ %stv64 : vector<64xi32>,
+ %stv128 : vector<128xi32>) {
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x1(ptr addrspace(6) {{%[0-9]+}}, i32 {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv1 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=1:i32 } : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x2(ptr addrspace(6) {{%[0-9]+}}, <2 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv2 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=2:i32 } : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x4(ptr addrspace(6) {{%[0-9]+}}, <4 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv4 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=4:i32 } : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x8(ptr addrspace(6) {{%[0-9]+}}, <8 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv8 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=8:i32 } : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x16(ptr addrspace(6) {{%[0-9]+}}, <16 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv16 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=16:i32 } : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x32(ptr addrspace(6) {{%[0-9]+}}, <32 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv32 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=32:i32 } : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x64(ptr addrspace(6) {{%[0-9]+}}, <64 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv64 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=64:i32 } : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.32x32b.x128(ptr addrspace(6) {{%[0-9]+}}, <128 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv128 unpack { shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>, num=128:i32 } : vector<128xi32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x32bx2
+llvm.func @nvvm_tcgen05_st_16x32bx2(
+ %tmemAddr : !llvm.ptr<6>,
+ %stv1 : i32,
+ %stv2 : vector<2xi32>,
+ %stv4 : vector<4xi32>,
+ %stv8 : vector<8xi32>,
+ %stv16 : vector<16xi32>,
+ %stv32 : vector<32xi32>,
+ %stv64 : vector<64xi32>,
+ %stv128 : vector<128xi32>) {
+
+ %offset = llvm.mlir.constant(2:i64) : i64
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x1(ptr addrspace(6) {{%[0-9]+}}, i64 2, i32 {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv1, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=1:i32 } : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x2(ptr addrspace(6) {{%[0-9]+}}, i64 2, <2 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv2, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=2:i32 } : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x4(ptr addrspace(6) {{%[0-9]+}}, i64 2, <4 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv4, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=4:i32 } : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x8(ptr addrspace(6) {{%[0-9]+}}, i64 2, <8 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv8, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=8:i32 } : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x16(ptr addrspace(6) {{%[0-9]+}}, i64 2, <16 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv16, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=16:i32 } : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x32(ptr addrspace(6) {{%[0-9]+}}, i64 2, <32 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv32, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=32:i32 } : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x64(ptr addrspace(6) {{%[0-9]+}}, i64 2, <64 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv64, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=64:i32 } : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x128(ptr addrspace(6) {{%[0-9]+}}, i64 2, <128 x i32> {{%[0-9]+}}, i1 false)
+ nvvm.tcgen05.st %tmemAddr, %stv128, %offset { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=128:i32 } : vector<128xi32>
+
+ llvm.return
+}
+
+// CHECK-LABEL: @nvvm_tcgen05_st_16x32bx2_pack
+llvm.func @nvvm_tcgen05_st_16x32bx2_pack(
+ %tmemAddr : !llvm.ptr<6>,
+ %stv1 : i32,
+ %stv2 : vector<2xi32>,
+ %stv4 : vector<4xi32>,
+ %stv8 : vector<8xi32>,
+ %stv16 : vector<16xi32>,
+ %stv32 : vector<32xi32>,
+ %stv64 : vector<64xi32>,
+ %stv128 : vector<128xi32>) {
+
+ %offset = llvm.mlir.constant(2:i64) : i64
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x1(ptr addrspace(6) {{%[0-9]+}}, i64 2, i32 {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv1, %offset unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=1:i32 } : i32
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x2(ptr addrspace(6) {{%[0-9]+}}, i64 2, <2 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv2, %offset unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=2:i32 } : vector<2xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x4(ptr addrspace(6) {{%[0-9]+}}, i64 2, <4 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv4, %offset unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=4:i32 } : vector<4xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x8(ptr addrspace(6) {{%[0-9]+}}, i64 2, <8 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv8, %offset unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=8:i32 } : vector<8xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x16(ptr addrspace(6) {{%[0-9]+}}, i64 2, <16 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv16, %offset unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=16:i32 } : vector<16xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x32(ptr addrspace(6) {{%[0-9]+}}, i64 2, <32 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv32, %offset unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=32:i32 } : vector<32xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x64(ptr addrspace(6) {{%[0-9]+}}, i64 2, <64 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv64, %offset unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=64:i32 } : vector<64xi32>
+
+// CHECK: call void @llvm.nvvm.tcgen05.st.16x32bx2.x128(ptr addrspace(6) {{%[0-9]+}}, i64 2, <128 x i32> {{%[0-9]+}}, i1 true)
+ nvvm.tcgen05.st %tmemAddr, %stv128, %offset unpack { shape = #nvvm.tcgen05_ldst_shape<shape_16x32bx2>, num=128:i32 } : vector<128xi32>
+
+ llvm.return
+}
More information about the Mlir-commits
mailing list