[Mlir-commits] [mlir] 70c2e06 - [mlir][nvgpu] Add nvgpu.tma.async.load and nvgpu.tma.descriptor
Guray Ozen
llvmlistbot at llvm.org
Fri Jul 21 01:23:30 PDT 2023
Author: Guray Ozen
Date: 2023-07-21T10:23:25+02:00
New Revision: 70c2e0618a0f3c09ed7149d88b4987b932eb6705
URL: https://github.com/llvm/llvm-project/commit/70c2e0618a0f3c09ed7149d88b4987b932eb6705
DIFF: https://github.com/llvm/llvm-project/commit/70c2e0618a0f3c09ed7149d88b4987b932eb6705.diff
LOG: [mlir][nvgpu] Add nvgpu.tma.async.load and nvgpu.tma.descriptor
This work adds the `nvgpu.tma.async.load` Op, which requests a TMA load asynchronously, using an mbarrier object to track completion.
It also adds the `nvgpu.tensormap.descriptor` type. Values of this type are meant to be created through the `cuTensorMapEncodeTiled` CUDA Driver API.
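As a quick usage sketch (hypothetical SSA names and tile shapes, loosely mirroring the 2-D test case added below):

  !desc = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32, 3>,
            swizzle = none, l2promo = none, oob = zero, interleave = none>
  !bar = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>

  // Asynchronously copy the 32x32 tile at (%x, %y) into %smem; completion
  // is signaled through the %mbar mbarrier.
  nvgpu.tma.async.load %tmap[%x, %y], %mbar to %smem
      : !desc, !bar -> memref<32x32xf32, 3>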
Reviewed By: nicolasvasilache
Differential Revision: https://reviews.llvm.org/D155453
Added:
Modified:
mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
index 5ca5707fe12ea7..13d754ca063165 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/CMakeLists.txt
@@ -1,2 +1,17 @@
add_mlir_dialect(NVGPU nvgpu)
add_mlir_doc(NVGPU NVGPU Dialects/ -gen-dialect-doc)
+
+set(LLVM_TARGET_DEFINITIONS NVGPU.td)
+mlir_tablegen(NVGPUEnums.h.inc -gen-enum-decls)
+mlir_tablegen(NVGPUEnums.cpp.inc -gen-enum-defs)
+add_public_tablegen_target(MLIRNVGPUEnumsIncGen)
+
+set(LLVM_TARGET_DEFINITIONS NVGPU.td)
+mlir_tablegen(NVGPUAttrDefs.h.inc -gen-attrdef-decls)
+mlir_tablegen(NVGPUAttrDefs.cpp.inc -gen-attrdef-defs)
+add_public_tablegen_target(MLIRNVGPUAttributesIncGen)
+
+set(LLVM_TARGET_DEFINITIONS NVGPU.td)
+mlir_tablegen(NVGPUAttrTypes.h.inc -gen-typedef-decls)
+mlir_tablegen(NVGPUAttrTypes.cpp.inc -gen-typedef-defs)
+add_public_tablegen_target(MLIRNVGPUTypesIncGen)
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index eb0bdee0b55f17..da0d755328fda4 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -23,6 +23,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/OpBase.td"
+include "mlir/IR/EnumAttr.td"
def NVGPU_Dialect : Dialect {
let name = "nvgpu";
@@ -61,6 +62,58 @@ def NVGPU_Dialect : Dialect {
}];
}
+//===----------------------------------------------------------------------===//
+// NVGPU Attribute Definitions
+//===----------------------------------------------------------------------===//
+
+def TensorMapSwizzleNone : I32EnumAttrCase<"SWIZZLE_NONE", 0, "none">;
+def TensorMapSwizzle32B : I32EnumAttrCase<"SWIZZLE_32B", 1, "swizzle_32b">;
+def TensorMapSwizzle64B : I32EnumAttrCase<"SWIZZLE_64B", 2, "swizzle_64b">;
+def TensorMapSwizzle128B : I32EnumAttrCase<"SWIZZLE_128B", 3, "swizzle_128b">;
+def TensorMapSwizzleKind : I32EnumAttr<"TensorMapSwizzleKind",
+ "Tensor map swizzling mode of shared memory banks",
+ [ TensorMapSwizzleNone, TensorMapSwizzle32B, TensorMapSwizzle64B,
+ TensorMapSwizzle128B]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::nvgpu";
+}
+
+def TensorMapL2PromoNone : I32EnumAttrCase<"L2PROMO_NONE", 0, "none">;
+def TensorMapL2Promo64B : I32EnumAttrCase<"L2PROMO_64B", 1, "l2promo_64b">;
+def TensorMapL2Promo128B : I32EnumAttrCase<"L2PROMO_128B", 2, "l2promo_128b">;
+def TensorMapL2Promo256B : I32EnumAttrCase<"L2PROMO_256B", 3, "l2promo_256b">;
+def TensorMapL2PromoKind : I32EnumAttr<"TensorMapL2PromoKind",
+ "Tensor map L2 promotion type",
+ [ TensorMapL2PromoNone, TensorMapL2Promo64B, TensorMapL2Promo128B,
+ TensorMapL2Promo256B]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::nvgpu";
+}
+
+def TensorMapOOBZero : I32EnumAttrCase<"OOB_ZERO", 0, "zero">;
+def TensorMapOOBNaN : I32EnumAttrCase<"OOB_NAN", 1, "nan">;
+def TensorMapOOBKind : I32EnumAttr<"TensorMapOOBKind",
+ "Tensor map out-of-bounds fill type",
+ [ TensorMapOOBZero, TensorMapOOBNaN]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::nvgpu";
+}
+
+def TensorMapInterleaveNone : I32EnumAttrCase<"INTERLEAVE_NONE", 0, "none">;
+def TensorMapInterleave16B : I32EnumAttrCase<"INTERLEAVE_16B", 1, "interleave_16b">;
+def TensorMapInterleave32B : I32EnumAttrCase<"INTERLEAVE_32B", 2, "interleave_32b">;
+def TensorMapInterleaveKind : I32EnumAttr<"TensorMapInterleaveKind",
+ "Tensor map interleave layout type",
+ [ TensorMapInterleaveNone, TensorMapInterleave16B, TensorMapInterleave32B]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::nvgpu";
+}
+
+def TensorMapSwizzleAttr : EnumAttr<NVGPU_Dialect, TensorMapSwizzleKind, "swizzle">;
+def TensorMapL2PromoAttr : EnumAttr<NVGPU_Dialect, TensorMapL2PromoKind, "l2promo">;
+def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
+def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
+
//===----------------------------------------------------------------------===//
// NVGPU Type Definitions
//===----------------------------------------------------------------------===//
@@ -100,6 +153,21 @@ def NVGPU_MBarrier : NVGPU_Type<"MBarrier", "mbarrier.barrier", []> {
def NVGPU_MBarrierToken : NVGPU_Type<"MBarrierToken", "mbarrier.token", []> { }
+// https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map
+def NVGPU_TensorMapDescriptor : NVGPU_Type<"TensorMapDescriptor", "tensormap.descriptor", []> {
+ let summary = "TensorMap descriptor";
+ let parameters = (ins "MemRefType":$tensor,
+ EnumParameter<TensorMapSwizzleKind>:$swizzle,
+ EnumParameter<TensorMapL2PromoKind>:$l2promo,
+ EnumParameter<TensorMapOOBKind>:$oob,
+ EnumParameter<TensorMapInterleaveKind>:$interleave);
+ let description = [{
+    `nvgpu.tensormap.descriptor` is a type that represents a TMA descriptor.
+    It is a 128-byte object that lives either in constant space or in the
+    kernel parameter space.
+ }];
+ let assemblyFormat = "`<` struct(params) `>`";
+}
+
//===----------------------------------------------------------------------===//
// NVGPU Op Definitions
//===----------------------------------------------------------------------===//
@@ -509,4 +577,27 @@ def NVGPU_MBarrierTryWaitParityOp : NVGPU_Op<"mbarrier.try_wait.parity", []> {
let assemblyFormat = "$barrier `,` $phase `,` $ticks attr-dict `:` type($barrier)";
}
+def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> {
+ let summary = "TMA asynchronous load";
+ let description = [{
+    The Op loads a tile of a memory region from global memory into shared
+    memory via the Tensor Memory Accelerator (TMA).
+
+    `$tensorMapDescriptor` is the tensor map descriptor, which carries
+    information such as the tile shape. The descriptor is created by
+    `nvgpu.tma.create.descriptor`.
+
+    The Op uses the `$barrier` mbarrier-based completion mechanism.
+ }];
+ let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,
+ NVGPU_MBarrier:$barrier,
+ NVGPU_TensorMapDescriptor:$tensorMapDescriptor,
+ Variadic<Index>:$coordinates);
+ let assemblyFormat = [{
+ $tensorMapDescriptor `[` $coordinates `]` `,` $barrier `to` $dst
+ attr-dict `:` type($tensorMapDescriptor) `,` type($barrier) `->` type($dst)
+ }];
+  let hasVerifier = 1;
+}
+
#endif // NVGPU
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
index ede8b781b2ca13..192afcb2dba791 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
@@ -19,6 +19,11 @@
#include "mlir/IR/OpDefinition.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc"
+
+#define GET_ATTRDEF_CLASSES
+#include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.h.inc"
+
#define GET_TYPEDEF_CLASSES
#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.h.inc"
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 26be5c03546c29..512f3ce9ecb7b1 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -413,6 +413,9 @@ struct ConvertNVGPUToNVVMPass
converter.addConversion([&](nvgpu::MBarrierType type) -> Type {
return converter.convertType(createMBarrierMemrefType(rewriter, type));
});
+ converter.addConversion([&](nvgpu::TensorMapDescriptorType type) -> Type {
+ return converter.getPointerType(type.getTensor().getElementType());
+ });
populateNVGPUToNVVMConversionPatterns(converter, patterns);
LLVMConversionTarget target(getContext());
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
@@ -770,11 +773,7 @@ struct NVGPUMBarrierInitLowering
Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
op.getBarrier(), adaptor.getBarrier());
- Value count = adaptor.getCount();
- if (!adaptor.getCount().getType().isInteger(32)) {
- count = rewriter.create<LLVM::TruncOp>(op->getLoc(),
- rewriter.getI32Type(), count);
- }
+ Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
if (isMbarrierShared(op.getBarrier().getType())) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierInitSharedOp>(op, barrier,
@@ -822,11 +821,7 @@ struct NVGPUMBarrierArriveNoCompleteLowering
op.getBarrier(), adaptor.getBarrier());
Type tokenType = getTypeConverter()->convertType(
nvgpu::MBarrierTokenType::get(op->getContext()));
- Value count = adaptor.getCount();
- if (!adaptor.getCount().getType().isInteger(32)) {
- count = rewriter.create<LLVM::TruncOp>(op->getLoc(),
- rewriter.getI32Type(), count);
- }
+ Value count = truncToI32(rewriter, op->getLoc(), adaptor.getCount());
if (isMbarrierShared(op.getBarrier().getType())) {
rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>(
op, tokenType, barrier, count);
@@ -910,6 +905,27 @@ struct NVGPUMBarrierTryWaitParityLowering
}
};
+struct NVGPUTmaAsyncLoadOpLowering
+ : public ConvertOpToLLVMPattern<nvgpu::TmaAsyncLoadOp> {
+ using ConvertOpToLLVMPattern<nvgpu::TmaAsyncLoadOp>::ConvertOpToLLVMPattern;
+ LogicalResult
+ matchAndRewrite(nvgpu::TmaAsyncLoadOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
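+    // Field 1 of the lowered memref descriptor is the aligned base pointer,
+    // i.e. the shared-memory destination of the bulk copy.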
+ auto dest = rewriter.create<LLVM::ExtractValueOp>(op->getLoc(),
+ adaptor.getDst(), 1);
+ Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
+ op.getBarrier(), adaptor.getBarrier());
+
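+    // The NVVM op expects 32-bit coordinates; truncate the index operands.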
+ SmallVector<Value> coords = adaptor.getCoordinates();
+ for (auto [index, value] : llvm::enumerate(coords)) {
+ coords[index] = truncToI32(rewriter, op->getLoc(), value);
+ }
+
+ rewriter.replaceOpWithNewOp<NVVM::CpAsyncBulkTensorGlobalToSharedClusterOp>(
+ op, dest, adaptor.getTensorMapDescriptor(), barrier, coords);
+ return success();
+ }
+};
} // namespace
void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
@@ -922,6 +938,7 @@ void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
NVGPUMBarrierTestWaitLowering, // nvgpu.mbarrier.test_wait_parity
NVGPUMBarrierTryWaitParityLowering, // nvgpu.mbarrier.try_wait_parity
NVGPUMBarrierArriveExpectTxLowering, // nvgpu.mbarrier.arrive.expect_tx
+ NVGPUTmaAsyncLoadOpLowering, // nvgpu.tma.async.load
MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
NVGPUMmaSparseSyncLowering>(converter);
diff --git a/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
index e95684a7f079dd..4d47ce4746dbbc 100644
--- a/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/NVGPU/IR/CMakeLists.txt
@@ -6,6 +6,9 @@ add_mlir_dialect_library(MLIRNVGPUDialect
DEPENDS
MLIRNVGPUIncGen
+ MLIRNVGPUEnumsIncGen
+ MLIRNVGPUAttributesIncGen
+ MLIRNVGPUTypesIncGen
LINK_LIBS PUBLIC
MLIRGPUDialect
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 07c29541faf416..fcb538993d1e54 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -14,21 +14,31 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/IR/Verifier.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/TypeSwitch.h"
using namespace mlir;
using namespace mlir::nvgpu;
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc"
+
void nvgpu::NVGPUDialect::initialize() {
addTypes<
#define GET_TYPEDEF_LIST
#include "mlir/Dialect/NVGPU/IR/NVGPUTypes.cpp.inc"
>();
+ addAttributes<
+#define GET_ATTRDEF_LIST
+#include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.cpp.inc"
+ >();
addOperations<
#define GET_OP_LIST
#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
@@ -320,11 +330,39 @@ LogicalResult LdMatrixOp::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// NVGPU_TmaAsyncLoadOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult TmaAsyncLoadOp::verify() {
+ // Destination memref
+ auto dstMemref = llvm::cast<MemRefType>(getDst().getType());
+ if (!NVGPUDialect::hasSharedMemoryAddressSpace(dstMemref)) {
+ return emitError()
+ << "The operation stores data to shared memory, but "
+ "the destination memref does not have a memory space of "
+ << NVGPUDialect::kSharedMemoryAddressSpace;
+ }
+ if (getCoordinates().size() > 5) {
+ return emitError() << "Maximum 5 coordinates are supported.";
+ }
+ if (getCoordinates().size() != size_t(dstMemref.getRank())) {
+ return emitError() << "Destination memref rank is "
+ << size_t(dstMemref.getRank()) << " but there are "
+ << getCoordinates().size()
+ << " coordinates. They must match.";
+ }
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// TableGen'd dialect, type, and op definitions
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.cpp.inc"
+#define GET_ATTRDEF_CLASSES
+#include "mlir/Dialect/NVGPU/IR/NVGPUAttrDefs.cpp.inc"
+
+#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.cpp.inc"
#define GET_OP_CLASSES
#include "mlir/Dialect/NVGPU/IR/NVGPU.cpp.inc"
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index c7a0c7f4b3ea94..f22a9e7ed60aef 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -559,7 +559,6 @@ func.func @mbarrier_nocomplete() {
func.return
}
-
// -----
!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
!tokenType = !nvgpu.mbarrier.token
@@ -603,4 +602,36 @@ func.func @mbarrier_txcount() {
nvgpu.mbarrier.try_wait.parity %barrier, %phase, %ticks : !barrierType
func.return
-}
\ No newline at end of file
+}
+
+// -----
+
+// CHECK-LABEL: func @async_tma_load
+!tensorMap1d = !nvgpu.tensormap.descriptor<tensor = memref<128xf32,3>, swizzle=none, l2promo = none, oob = nan, interleave = interleave_16b>
+!tensorMap2d = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
+!tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x32xf32,3>, swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none>
+!tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32,3>, swizzle=swizzle_128b,l2promo = l2promo_128b,oob = zero, interleave = none>
+!tensorMap5d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x2x32x32xf32,3>, swizzle=none, l2promo = none, oob = zero, interleave = none>
+!mbarrier = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d,
+ %buffer1d: memref<128xf32,3>,
+ %buffer2d: memref<32x32xf32,3>,
+ %buffer3d: memref<2x32x32xf32,3>,
+ %buffer4d: memref<2x2x32x32xf32,3>,
+ %buffer5d: memref<2x2x2x32x32xf32,3>,
+ %mbarrier: !mbarrier) {
+ %crd0 = arith.constant 0 : index
+ %crd1 = arith.constant 0 : index
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}]
+ nvgpu.tma.async.load %tensorMap1d[%crd0], %mbarrier to %buffer1d : !tensorMap1d, !mbarrier -> memref<128xf32,3>
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}]
+ nvgpu.tma.async.load %tensorMap2d[%crd0, %crd1], %mbarrier to %buffer2d : !tensorMap2d, !mbarrier -> memref<32x32xf32,3>
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}]
+ nvgpu.tma.async.load %tensorMap3d[%crd0, %crd1, %crd0], %mbarrier to %buffer3d : !tensorMap3d, !mbarrier -> memref<2x32x32xf32,3>
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
+ nvgpu.tma.async.load %tensorMap4d[%crd0, %crd1, %crd1, %crd0], %mbarrier to %buffer4d : !tensorMap4d, !mbarrier -> memref<2x2x32x32xf32,3>
+ // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %{{.*}}, %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}]
+ nvgpu.tma.async.load %tensorMap5d[%crd0, %crd1, %crd1, %crd0, %crd0], %mbarrier to %buffer5d : !tensorMap5d, !mbarrier -> memref<2x2x2x32x32xf32,3>
+ func.return
+}
+
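As host-side context for the `!nvgpu.tensormap.descriptor` values exercised above: the commit message notes the descriptor is produced by the `cuTensorMapEncodeTiled` CUDA Driver API. A minimal sketch, assuming a row-major 128x128 f32 global tensor and a 32x32 tile with a plain configuration (no swizzle, no L2 promotion, zero out-of-bounds fill, no interleave); `devPtr` and the function name are hypothetical and error handling is omitted:

  #include <cuda.h>

  // Encode a TMA descriptor for a 32x32 f32 tile of a 128x128 row-major
  // tensor at devPtr, with no interleave/swizzle/L2 promotion and zero
  // out-of-bounds fill.
  CUtensorMap encodeTileDescriptor(void *devPtr) {
    CUtensorMap map;
    cuuint64_t globalDim[2]    = {128, 128};            // innermost dim first
    cuuint64_t globalStride[1] = {128 * sizeof(float)}; // bytes; rank-1 entries
    cuuint32_t boxDim[2]       = {32, 32};              // tile shape
    cuuint32_t elemStride[2]   = {1, 1};
    cuTensorMapEncodeTiled(&map, CU_TENSOR_MAP_DATA_TYPE_FLOAT32,
                           /*tensorRank=*/2, devPtr, globalDim, globalStride,
                           boxDim, elemStride, CU_TENSOR_MAP_INTERLEAVE_NONE,
                           CU_TENSOR_MAP_SWIZZLE_NONE,
                           CU_TENSOR_MAP_L2_PROMOTION_NONE,
                           CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
    return map;
  }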