[Mlir-commits] [mlir] [mlir][gpu] Generate multiple rank-specializations for tensor map cre… (PR #74082)
Adam Paszke
llvmlistbot at llvm.org
Fri Dec 1 06:27:34 PST 2023
https://github.com/apaszke updated https://github.com/llvm/llvm-project/pull/74082
>From ae79cc82698b3211942452c0f305b00518519466 Mon Sep 17 00:00:00 2001
From: Adam Paszke <apaszke at google.com>
Date: Fri, 1 Dec 2023 14:22:09 +0000
Subject: [PATCH] [mlir][gpu] Generate multiple rank-specializations for tensor
map creation
The previous code was technically incorrect in that the type indicated
that the memref only has 1 dimension, while the code below was happily
dereferencing the size array out of bounds. Now, if the compiler doesn't
get too smart about optimizations, this code *might even work*. But, if
the compiler realizes that the array has 1 element, it might start doing
silly things. This change generates one specialization per supported rank,
making sure we don't invoke any UB.
---
.../ExecutionEngine/CudaRuntimeWrappers.cpp | 43 +++++++++++++++++--
1 file changed, 40 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index b8ac9ab90a9f3b8..5ec87d58cc57f8b 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -423,9 +423,24 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuTensorMapEncodeTiled(
elementStrides[4], interleave, swizzle, l2Promotion, oobFill);
}
+namespace {
+
+template <int rank>
+void mgpuGetMemRefDataAndShape(void *raw_descriptor, char **addr,
+ uint64_t *globalDim) {
+ auto descriptor =
+ reinterpret_cast<StridedMemRefType<char, rank> *>(raw_descriptor);
+ *addr = descriptor->data;
+ for (int i = 0; i < rank; ++i) {
+ globalDim[i] = static_cast<uint64_t>(descriptor->sizes[rank - i - 1]);
+ }
+}
+
+} // namespace
+
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
int64_t tensorRank, // Dimensionality of tensor
- StridedMemRefType<char, 1> *descriptor, // Starting address
+ void *ranked_descriptor, // Ranked MemRef descriptor
const CUtensorMapDataType tensorDataType, // Stride size (in bytes)
CUtensorMapInterleave interleave, // Type of interleaved layout
CUtensorMapSwizzle swizzle, // Bank swizzling pattern
@@ -435,17 +450,39 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuTensorMapEncodeTiledMemref(
) {
CUtensorMap tensorMap;
- auto *globalAddress = descriptor->data;
uint32_t boxDim[5] = {1, 1, 1, 1, 1}, elementStrides[5] = {1, 1, 1, 1, 1};
uint64_t globalDim[5] = {1, 1, 1, 1, 1}, globalStrides[5] = {0};
uint32_t tensorRank32 = uint32_t(tensorRank);
+ char *globalAddress = nullptr;
+ switch (tensorRank) {
+ case 1:
+ mgpuGetMemRefDataAndShape<1>(ranked_descriptor, &globalAddress, globalDim);
+ break;
+ case 2:
+ mgpuGetMemRefDataAndShape<2>(ranked_descriptor, &globalAddress, globalDim);
+ break;
+ case 3:
+ mgpuGetMemRefDataAndShape<3>(ranked_descriptor, &globalAddress, globalDim);
+ break;
+ case 4:
+ mgpuGetMemRefDataAndShape<4>(ranked_descriptor, &globalAddress, globalDim);
+ break;
+ case 5:
+ mgpuGetMemRefDataAndShape<5>(ranked_descriptor, &globalAddress, globalDim);
+ break;
+ default:
+ fprintf(
+ stderr,
+ "'mgpuTensorMapEncodeTiledMemref' failed with 'rank is too high'\n");
+ return NULL;
+ }
+
static const int elementSizeInBytes[] = {1, 2, 4, 4, 8, 8, 2,
4, 8, 2, 4, 4, 4};
for (int64_t r = 0; r < tensorRank; ++r) {
elementStrides[r] = uint32_t(1);
boxDim[r] = static_cast<uint32_t>(inputBoxDims[tensorRank - r - 1]);
- globalDim[r] = static_cast<uint64_t>(descriptor->sizes[tensorRank - r - 1]);
}
globalStrides[0] = globalDim[0] * elementSizeInBytes[tensorDataType];
More information about the Mlir-commits
mailing list