[Mlir-commits] [mlir] 50a76a7 - [MLIR][NVGPU] Handling Offset in `nvgpu.tma.async.load`
Guray Ozen
llvmlistbot at llvm.org
Tue Aug 8 04:25:06 PDT 2023
Author: Guray Ozen
Date: 2023-08-08T13:25:00+02:00
New Revision: 50a76a7d73dbe587aa07f6f9e1ec336612c16c5f
URL: https://github.com/llvm/llvm-project/commit/50a76a7d73dbe587aa07f6f9e1ec336612c16c5f
DIFF: https://github.com/llvm/llvm-project/commit/50a76a7d73dbe587aa07f6f9e1ec336612c16c5f.diff
LOG: [MLIR][NVGPU] Handling Offset in `nvgpu.tma.async.load`
When the `nvgpu.tma.async.load` Op is used to asynchronously load data into shared memory, the lowering fails to account for the offset carried by the destination memref, potentially leading to incorrect memory accesses. Using an offset is common practice, especially with dynamic shared memory. This patch addresses the problem by folding the offset into the computed destination address.
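For illustration, a condensed sketch of the pattern this affects (adapted from the test added below; the SSA names, and the single-step `memref.reinterpret_cast` in place of the test's reinterpret/subview chain, are only illustrative): a tile placed 8192 f16 elements into dynamic shared memory carries `offset: 8192` in its memref type, and that offset must end up in the address handed to the TMA copy.

  // Dynamic shared memory is modeled as a zero-sized workgroup global.
  memref.global "private" @dynamicShmem : memref<0xf16, 3>

  // Inside the kernel: view the RHS tile at element offset 8192.
  %dynamicMem = memref.get_global @dynamicShmem : memref<0xf16, 3>
  %rhsShmem = memref.reinterpret_cast %dynamicMem to offset: [8192], sizes: [64, 128], strides: [128, 1] : memref<0xf16, 3> to memref<64x128xf16, strided<[128, 1], offset: 8192>, 3>

  // With this patch, the lowering adds the 8192-element offset on top of the
  // aligned base pointer instead of using the base pointer directly.
  nvgpu.tma.async.load %rhsTensorMap[%c0, %c0], %mbarrier to %rhsShmem : !rhsTensorMap, !barrierType -> memref<64x128xf16, strided<[128, 1], offset: 8192>, 3>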
Reviewed By: nicolasvasilache
Differential Revision: https://reviews.llvm.org/D157380
Added:
Modified:
mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
Removed:
################################################################################
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 95d16c290c0d21..68a9251bc91645 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -914,8 +914,9 @@ struct NVGPUTmaAsyncLoadOpLowering
   LogicalResult
   matchAndRewrite(nvgpu::TmaAsyncLoadOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto dest = rewriter.create<LLVM::ExtractValueOp>(op->getLoc(),
-                                                      adaptor.getDst(), 1);
+    auto srcMemrefType = cast<MemRefType>(op.getDst().getType());
+    Value dest = getStridedElementPtr(op->getLoc(), srcMemrefType,
+                                      adaptor.getDst(), {}, rewriter);
     Value barrier = getMbarrierPtr(rewriter, *getTypeConverter(),
                                    op.getBarrier(), adaptor.getBarrier());
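In the lowered LLVM dialect IR, the effect is roughly the following (a sketch only; `%dst_desc` stands for the already-converted destination memref descriptor, and 8192 is the static offset used in the test below, matching its CHECK lines):

  // Before: only the aligned pointer (field 1 of the memref descriptor) was
  // extracted, so a static offset in the destination memref type was dropped.
  %dest = llvm.extractvalue %dst_desc[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>

  // After: getStridedElementPtr applies the offset on top of the aligned pointer.
  %aligned = llvm.extractvalue %dst_desc[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
  %c8192 = llvm.mlir.constant(8192 : index) : i64
  %destWithOffset = llvm.getelementptr %aligned[%c8192] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16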
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 18e8efe1fa9003..cb2b208eb13bd1 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -647,3 +647,35 @@ func.func @create_tensor_map(%devicePtr2d : memref<64x128xf32>, %devicePtr1d : m
%tensorMap1d = nvgpu.tma.create.descriptor %devicePtr1d_unranked box[%crd1] : memref<*xf32> -> !tensorMap1d
func.return
}
+
+// -----
+
+!lhsTensorMap = !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+!rhsTensorMap = !nvgpu.tensormap.descriptor<tensor = memref<64x128xf16, strided<[128, 1], offset: 8192>, 3>, swizzle = swizzle_128b, l2promo = none, oob = zero, interleave = none>
+
+!barrierType = !nvgpu.mbarrier.barrier<memorySpace = #gpu.address_space<workgroup>>
+
+!shmemlhs = memref<128x64xf16,3>
+!shmemrhs = memref<64x128xf16, strided<[128, 1], offset: 8192>, 3>
+
+module @mymodule {
+  // Dynamic Shared memory
+  memref.global "private" @dynamicShmem : memref<0xf16,3>
+
+  func.func @async_tma_load(%lhsTensorMap: !lhsTensorMap, %rhsTensorMap: !rhsTensorMap, %mbarrier: !barrierType) {
+    %c0 = arith.constant 0 : index
+    %dynamicMem = memref.get_global @dynamicShmem : memref<0xf16, 3>
+    %lhsShmem = memref.reinterpret_cast %dynamicMem to offset: [0], sizes: [128,64], strides: [64,1] : memref<0xf16, 3> to !shmemlhs
+    %rhsShmem2 = memref.reinterpret_cast %dynamicMem to offset: [0], sizes: [2,64,128], strides: [8192,128,1] : memref<0xf16, 3> to memref<2x64x128xf16,3>
+    %rhsShmem3 = memref.subview %rhsShmem2[1,0,0][1, 64, 128][1, 1, 1] : memref<2x64x128xf16,3> to memref<1x64x128xf16, strided<[8192, 128, 1], offset: 8192>, 3>
+    %rhsShmem = memref.subview %rhsShmem3[0,0,0][1, 64, 128][1, 1, 1] : memref<1x64x128xf16, strided<[8192, 128, 1], offset: 8192>, 3> to !shmemrhs
+    // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global
+    nvgpu.tma.async.load %lhsTensorMap[%c0, %c0], %mbarrier to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs
+    // CHECK: %[[desc:.+]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
+    // CHECK: %[[c8192:.+]] = llvm.mlir.constant(8192 : index) : i64
+    // CHECK: %[[shmemOfset:.+]] = llvm.getelementptr %[[desc]][%[[c8192]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16
+    // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %[[shmemOfset]], %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i32, i32
+    nvgpu.tma.async.load %rhsTensorMap[%c0, %c0], %mbarrier to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs
+    return
+  }
+}