[Mlir-commits] [mlir] e56d674 - [mlir][nvgpu] Add `tma.create.descriptor` to create tensor map descriptor
Guray Ozen
llvmlistbot at llvm.org
Fri Jul 21 02:33:10 PDT 2023
Author: Guray Ozen
Date: 2023-07-21T11:33:04+02:00
New Revision: e56d6745f7d7a91f7c83fcd1f52d99e38d00892a
URL: https://github.com/llvm/llvm-project/commit/e56d6745f7d7a91f7c83fcd1f52d99e38d00892a
DIFF: https://github.com/llvm/llvm-project/commit/e56d6745f7d7a91f7c83fcd1f52d99e38d00892a.diff
LOG: [mlir][nvgpu] Add `tma.create.descriptor` to create tensor map descriptor
The Op creates a tensor map descriptor object representing a tiled memory region. The descriptor is used by Tensor Memory Access (TMA). The `tensor` is the source tensor to be tiled. The `boxDimensions` gives the size of the tiled memory region in each dimension.
The pattern here lowers `tma.create.descriptor` to a runtime function call that eventually calls the CUDA Driver's `cuTensorMapEncodeTiled`. For more information, see:
https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
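A minimal use of the new op, following the test added in this patch, looks like this (the 2-D descriptor type is taken from the test; other swizzle/l2promo/oob settings are possible):

  !tensorMap2d = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle = swizzle_32b, l2promo = none, oob = zero, interleave = none>

  func.func @create_tensor_map(%devicePtr2d : memref<64x128xf32>) {
    %crd0 = arith.constant 64 : index
    %crd1 = arith.constant 128 : index
    %devicePtr2d_unranked = memref.cast %devicePtr2d : memref<64x128xf32> to memref<*xf32>
    %tensorMap2d = nvgpu.tma.create.descriptor %devicePtr2d_unranked box[%crd0, %crd1] : memref<*xf32> -> !tensorMap2d
    func.return
  }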
Depends on D155453
Reviewed By: nicolasvasilache
Differential Revision: https://reviews.llvm.org/D155680
Added:
Modified:
mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
index 64a9614ee4397a..288456d76fd031 100644
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -9,6 +9,9 @@
#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Types.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/StringRef.h"
#include <functional>
@@ -47,6 +50,18 @@ using BlobGenerator =
using LoweringCallback = std::function<std::unique_ptr<llvm::Module>(
Operation *, llvm::LLVMContext &, StringRef)>;
+struct FunctionCallBuilder {
+ FunctionCallBuilder(StringRef functionName, Type returnType,
+ ArrayRef<Type> argumentTypes)
+ : functionName(functionName),
+ functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
+ LLVM::CallOp create(Location loc, OpBuilder &builder,
+ ArrayRef<Value> arguments) const;
+
+ StringRef functionName;
+ LLVM::LLVMFunctionType functionType;
+};
+
/// Collect a set of patterns to convert from the GPU dialect to LLVM and
/// populate converter for gpu types.
void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index da0d755328fda4..957394d7e423c9 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -600,4 +600,28 @@ def NVGPU_TmaAsyncLoadOp : NVGPU_Op<"tma.async.load", []> {
}
+def NVGPU_TmaCreateDescriptorOp : NVGPU_Op<"tma.create.descriptor", []> {
+ let summary = "TMA create descriptor";
+ let description = [{
+ The Op creates a tensor map descriptor object representing a tiled memory
+ region. To do that, it calls the CUDA Driver's `cuTensorMapEncodeTiled`. The
+ descriptor is used by Tensor Memory Access (TMA).
+
+ The `tensor` is the source tensor to be tiled.
+
+ The `boxDimensions` is the size of the tiled memory region in each dimension.
+
+ For more information see below:
+ https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
+ }];
+
+ let arguments = (ins AnyUnrankedMemRef:$tensor,
+ Variadic<Index>:$boxDimensions);
+ let results = (outs NVGPU_TensorMapDescriptor:$tensorMap);
+ let assemblyFormat = [{
+ $tensor `box` `[` $boxDimensions `]` attr-dict `:` type($tensor) `->` type($tensorMap)
+ }];
+ let hasVerifier = 1;
+}
+
#endif // NVGPU
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 6c383de2e6a69b..93034e253c065b 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -58,18 +58,6 @@ class GpuToLLVMConversionPass
void runOnOperation() override;
};
-struct FunctionCallBuilder {
- FunctionCallBuilder(StringRef functionName, Type returnType,
- ArrayRef<Type> argumentTypes)
- : functionName(functionName),
- functionType(LLVM::LLVMFunctionType::get(returnType, argumentTypes)) {}
- LLVM::CallOp create(Location loc, OpBuilder &builder,
- ArrayRef<Value> arguments) const;
-
- StringRef functionName;
- LLVM::LLVMFunctionType functionType;
-};
-
template <typename OpTy>
class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
public:
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 3e10ba59ddb2f2..95d16c290c0d21 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -8,15 +8,19 @@
#include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
+#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Pass/Pass.h"
+#include "llvm/Support/raw_ostream.h"
namespace mlir {
#define GEN_PASS_DEF_CONVERTNVGPUTONVVMPASS
@@ -925,6 +929,121 @@ struct NVGPUTmaAsyncLoadOpLowering
return success();
}
};
+
+static Value makeI64Const(RewriterBase &rewriter, Operation *op,
+ int32_t index) {
+ return rewriter.create<LLVM::ConstantOp>(op->getLoc(),
+ rewriter.getIntegerType(64),
+ rewriter.getI32IntegerAttr(index));
+}
+
+/// Returns a Value that holds the data type enum expected by the CUDA driver.
+static Value elementTypeAsLLVMConstant(RewriterBase &rewriter, Operation *op,
+ Type type) {
+ // Enum is from CUDA driver API
+ // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html
+ enum CUtensorMapDataTypeEnum {
+ CU_TENSOR_MAP_DATA_TYPE_UINT8 = 0,
+ CU_TENSOR_MAP_DATA_TYPE_UINT16,
+ CU_TENSOR_MAP_DATA_TYPE_UINT32,
+ CU_TENSOR_MAP_DATA_TYPE_INT32,
+ CU_TENSOR_MAP_DATA_TYPE_UINT64,
+ CU_TENSOR_MAP_DATA_TYPE_INT64,
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT16,
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT32,
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT64,
+ CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
+ CU_TENSOR_MAP_DATA_TYPE_FLOAT32_FTZ,
+ CU_TENSOR_MAP_DATA_TYPE_TFLOAT32,
+ CU_TENSOR_MAP_DATA_TYPE_TFLOAT32_FTZ
+ };
+
+ if (type.isUnsignedInteger(8))
+ return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT8);
+ if (type.isUnsignedInteger(16))
+ return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT16);
+ if (type.isUnsignedInteger(32))
+ return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT32);
+ if (type.isUnsignedInteger(64))
+ return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_UINT64);
+ if (type.isSignlessInteger(32))
+ return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_INT32);
+ if (type.isSignlessInteger(64))
+ return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_INT64);
+ if (type.isF16())
+ return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_FLOAT16);
+ if (type.isF32())
+ return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_FLOAT32);
+ if (type.isF64())
+ return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_FLOAT64);
+ if (type.isBF16())
+ return makeI64Const(rewriter, op, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16);
+
+ llvm_unreachable("Not supported data type");
+}
+
+struct NVGPUTmaCreateDescriptorOpLowering
+ : public ConvertOpToLLVMPattern<nvgpu::TmaCreateDescriptorOp> {
+ using ConvertOpToLLVMPattern<
+ nvgpu::TmaCreateDescriptorOp>::ConvertOpToLLVMPattern;
+ LogicalResult
+ matchAndRewrite(nvgpu::TmaCreateDescriptorOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ Location loc = op->getLoc();
+ LLVM::LLVMPointerType llvmPointerType = getTypeConverter()->getPointerType(
+ IntegerType::get(op->getContext(), 8));
+ Type llvmInt64Type = IntegerType::get(op->getContext(), 64);
+
+ Value tensorElementType = elementTypeAsLLVMConstant(
+ rewriter, op, op.getTensor().getType().getElementType());
+ auto promotedOperands = getTypeConverter()->promoteOperands(
+ loc, op->getOperands(), adaptor.getOperands(), rewriter);
+
+ Value boxArrayPtr = rewriter.create<LLVM::AllocaOp>(
+ loc, llvmPointerType, llvmInt64Type, makeI64Const(rewriter, op, 5));
+ for (auto [index, value] : llvm::enumerate(adaptor.getBoxDimensions())) {
+ Value gep = rewriter.create<LLVM::GEPOp>(
+ loc, llvmPointerType, llvmPointerType, boxArrayPtr,
+ makeI64Const(rewriter, op, index));
+ rewriter.create<LLVM::StoreOp>(loc, value, gep);
+ }
+
+ nvgpu::TensorMapDescriptorType desc = op.getTensorMap().getType();
+ // Set Arguments for the function call
+ SmallVector<Value> arguments;
+ arguments.push_back(promotedOperands[0]); // rank
+ arguments.push_back(promotedOperands[1]); // descriptor
+ arguments.push_back(tensorElementType); // data type
+ arguments.push_back(
+ makeI64Const(rewriter, op, (int)desc.getInterleave())); // interleave
+ arguments.push_back(
+ makeI64Const(rewriter, op, (int)desc.getSwizzle())); // swizzle
+ arguments.push_back(
+ makeI64Const(rewriter, op, (int)desc.getL2promo())); // l2promo
+ arguments.push_back(makeI64Const(rewriter, op, (int)desc.getOob())); // oob
+ arguments.push_back(boxArrayPtr); // box dimensions
+
+ // Set data types of the arguments
+ SmallVector<Type> argTypes = {
+ llvmInt64Type, /* int64_t tensorRank */
+ llvmPointerType, /* ptr */
+ llvmInt64Type, /* int64_t */
+ llvmInt64Type, /* int64_t */
+ llvmInt64Type, /* int64_t */
+ llvmInt64Type, /* int64_t */
+ llvmInt64Type, /* int64_t */
+ llvmPointerType /* ptr */
+ };
+ FunctionCallBuilder hostRegisterCallBuilder = {
+ "mgpuTensorMapEncodeTiledMemref", llvmPointerType, argTypes};
+ Value tensorMap =
+ hostRegisterCallBuilder.create(loc, rewriter, arguments).getResult();
+
+ rewriter.replaceOp(op, tensorMap);
+ return success();
+ }
+};
+
} // namespace
void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
@@ -936,6 +1055,8 @@ void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
NVGPUMBarrierArriveNoCompleteLowering, // nvgpu.mbarrier.arrive.no_complete
NVGPUMBarrierTestWaitLowering, // nvgpu.mbarrier.test_wait_parity
NVGPUMBarrierTryWaitParityLowering, // nvgpu.mbarrier.try_wait_parity
+ NVGPUTmaAsyncLoadOpLowering, // nvgpu.tma.async.load
+ NVGPUTmaCreateDescriptorOpLowering, // nvgpu.tma.create.descriptor
NVGPUMBarrierArriveExpectTxLowering, // nvgpu.mbarrier.arrive.expect_tx
NVGPUTmaAsyncLoadOpLowering, // nvgpu.tma.async.load
MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
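For intuition, the pattern above lowers the op to a stack-allocated array holding the box dimensions plus a call into the runtime; the generated IR looks roughly like the following sketch (SSA names are invented here, and %rank / %memrefDesc come from promoting the unranked memref operand):

  %c5  = llvm.mlir.constant(5 : i64) : i64
  %box = llvm.alloca %c5 x i64 : (i64) -> !llvm.ptr
  // ... one llvm.getelementptr + llvm.store per entry of boxDimensions ...
  %tensorMap = llvm.call @mgpuTensorMapEncodeTiledMemref(%rank, %memrefDesc, %dataType, %interleave, %swizzle, %l2promo, %oob, %box)
      : (i64, !llvm.ptr, i64, i64, i64, i64, i64, !llvm.ptr) -> !llvm.ptr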
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index fcb538993d1e54..24c490568a4383 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -355,6 +355,17 @@ LogicalResult TmaAsyncLoadOp::verify() {
return success();
}
+LogicalResult TmaCreateDescriptorOp::verify() {
+ if (getBoxDimensions().size() > 5) {
+ return emitError() << "Maximum 5 dimensional box is supported.";
+ }
+ nvgpu::TensorMapDescriptorType desc = getTensorMap().getType();
+ if (desc.getInterleave() != TensorMapInterleaveKind::INTERLEAVE_NONE)
+ return emitError() << "Interleave options are not supported yet.";
+
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// TableGen'd dialect, type, and op definitions
//===----------------------------------------------------------------------===//
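To illustrate the new verifier, IR along the following lines (a hypothetical snippet, not part of the patch; %u and %c128 stand for suitable memref.cast and arith.constant results) is rejected, since only interleave = none is accepted for now:

  // error: Interleave options are not supported yet.
  %d = nvgpu.tma.create.descriptor %u box[%c128] : memref<*xf32> -> !nvgpu.tensormap.descriptor<tensor = memref<128xf32,3>, swizzle = none, l2promo = none, oob = nan, interleave = interleave_16b>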
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 0ea7127e931659..fe3c229aff970a 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -254,6 +254,71 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
defaultDevice = device;
}
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
+ CUtensorMap *tensorMap, // Tensor map object
+ CUtensorMapDataType tensorDataType, // Tensor data type
+ cuuint32_t tensorRank, // Dimensionality of tensor
+ void *globalAddress, // Starting address
+ const cuuint64_t *globalDim, // Tensor size (number of elements)
+ const cuuint64_t *globalStrides, // Stride size (in bytes)
+ const cuuint32_t *boxDim, // Traversal box (number of elements)
+ const cuuint32_t *elementStrides, // Traversal stride
+ CUtensorMapInterleave interleave, // Type of interleaved layout
+ CUtensorMapSwizzle swizzle, // Bank swizzling pattern
+ CUtensorMapL2promotion l2Promotion, // L2 promotion size
+ CUtensorMapFloatOOBfill oobFill // Padding zfill or NaN fill
+) {
+ ScopedContext scopedContext;
+ CUDA_REPORT_IF_ERROR(cuTensorMapEncodeTiled(
+ tensorMap, tensorDataType, tensorRank, globalAddress, globalDim,
+ globalStrides, boxDim, elementStrides, interleave, swizzle, l2Promotion,
+ oobFill));
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
+ int64_t tensorRank, // Dimensionality of tensor
+ StridedMemRefType<char, 1> *descriptor, // Ranked memref descriptor (base address and sizes)
+ const CUtensorMapDataType tensorDataType, // Tensor data type
+ CUtensorMapInterleave interleave, // Type of interleaved layout
+ CUtensorMapSwizzle swizzle, // Bank swizzling pattern
+ CUtensorMapL2promotion l2Promotion, // L2 promotion size
+ CUtensorMapFloatOOBfill oobFill, // Padding zfill or NaN fill
+ int64_t *inputBoxDims // Box dimensions (tile size in each dimension)
+) {
+ CUtensorMap tensorMap;
+
+ auto *globalAddress = descriptor->data;
+ uint32_t boxDim[5] = {0}, elementStrides[5] = {0};
+ uint64_t globalDim[5] = {0}, globalStrides[5] = {0};
+ uint32_t tensorRank32 = uint32_t(tensorRank);
+
+ static const int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
+ 4, 8, 2, 4, 4, 4};
+ for (int64_t r = 0; r < tensorRank; ++r) {
+ elementStrides[r] = uint32_t(1);
+ boxDim[r] = static_cast<uint32_t>(inputBoxDims[tensorRank - r - 1]);
+ globalDim[r] = static_cast<uint64_t>(descriptor->sizes[tensorRank - r - 1]);
+ }
+
+ globalStrides[0] = globalDim[0] * elementSizeInBytes[tensorDataType];
+ for (int r = 1; r < tensorRank - 1; r++)
+ globalStrides[r] = globalStrides[r - 1] * globalDim[r];
+
+ ScopedContext scopedContext;
+ mgpuTensorMapEncodeTiled(&tensorMap, tensorDataType, tensorRank32,
+ globalAddress, globalDim, globalStrides, boxDim,
+ elementStrides, interleave, swizzle, l2Promotion,
+ oobFill);
+ // Copy created tensor map to device
+ CUdeviceptr dTensorMap;
+ CUDA_REPORT_IF_ERROR(cuMemAlloc(&dTensorMap, sizeof(CUtensorMap)));
+ CUDA_REPORT_IF_ERROR(cuMemcpy(dTensorMap,
+ reinterpret_cast<CUdeviceptr>(&tensorMap),
+ sizeof(CUtensorMap)));
+ return reinterpret_cast<void *>(dTensorMap);
+}
+
#ifdef MLIR_ENABLE_CUDA_CUSPARSE
///
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 73e1d5d3cf0513..18e8efe1fa9003 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -607,10 +607,10 @@ func.func @mbarrier_txcount() {
// -----
// CHECK-LABEL: func @async_tma_load
-!tensorMap1d = !nvgpu.tensormap.descriptor<tensor = memref<128xf32,3>, swizzle=none, l2promo = none, oob = nan, interleave = interleave_16b>
+!tensorMap1d = !nvgpu.tensormap.descriptor<tensor = memref<128xf32,3>, swizzle=none, l2promo = none, oob = nan, interleave = none>
!tensorMap2d = !nvgpu.tensormap.descriptor<tensor = memref<32x32xf32,3>, swizzle=swizzle_32b, l2promo = none, oob = zero, interleave = none>
!tensorMap3d = !nvgpu.tensormap.descriptor<tensor = memref<2x32x32xf32,3>, swizzle=swizzle_64b, l2promo = l2promo_64b, oob = zero, interleave = none>
-!tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32,3>, swizzle=swizzle_128b,l2promo = l2promo_128b,oob = zero, interleave = none>
+!tensorMap4d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x32x32xf32,3>, swizzle=swizzle_128b,l2promo = l2promo_128b,oob = zero, interleave = interleave_16b>
!tensorMap5d = !nvgpu.tensormap.descriptor<tensor = memref<2x2x2x32x32xf32,3>, swizzle=none, l2promo = none, oob = zero, interleave = none>
!mbarrier = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d, %tensorMap3d: !tensorMap3d, %tensorMap4d: !tensorMap4d, %tensorMap5d: !tensorMap5d,
@@ -635,18 +635,15 @@ func.func @async_tma_load(%tensorMap1d: !tensorMap1d, %tensorMap2d: !tensorMap2d
func.return
}
-// -----
-
-!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
-module @find_parent{
- func.func @main() {
- %c1 = arith.constant 1 : index
- gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
- threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1, %block_z = %c1) {
- // CHECK: memref.get_global @__mbarrier : memref<1xi64, 3>
- %barrier = nvgpu.mbarrier.create -> !barrierType
- gpu.terminator
- }
- func.return
- }
+func.func @create_tensor_map(%devicePtr2d : memref<64x128xf32>, %devicePtr1d : memref<128xf32>) {
+ %crd0 = arith.constant 64 : index
+ %crd1 = arith.constant 128 : index
+ %devicePtr2d_unranked = memref.cast %devicePtr2d : memref<64x128xf32> to memref<*xf32>
+ // CHECK: llvm.call @mgpuTensorMapEncodeTiledMemref
+ %tensorMap2d = nvgpu.tma.create.descriptor %devicePtr2d_unranked box[%crd0, %crd1] : memref<*xf32> -> !tensorMap2d
+
+ %devicePtr1d_unranked = memref.cast %devicePtr1d : memref<128xf32> to memref<*xf32>
+ // CHECK: llvm.call @mgpuTensorMapEncodeTiledMemref
+ %tensorMap1d = nvgpu.tma.create.descriptor %devicePtr1d_unranked box[%crd1] : memref<*xf32> -> !tensorMap1d
+ func.return
}