[Mlir-commits] [mlir] 794979a - [mlir][gpu] Improve foreach_thread distribution

Tue Jan 17 09:24:55 PST 2023

Author: Thomas Raoux
Date: 2023-01-17T17:12:55Z
New Revision: 794979ad8cc975b4417dcf5fcce9c9e8e28e52b3

URL: https://github.com/llvm/llvm-project/commit/794979ad8cc975b4417dcf5fcce9c9e8e28e52b3
DIFF: https://github.com/llvm/llvm-project/commit/794979ad8cc975b4417dcf5fcce9c9e8e28e52b3.diff

LOG: [mlir][gpu] Improve foreach_thread distribution

When distributing foreach_thread, replace the thread id with a constant 0 along every dimension whose block dim is 1.

Differential Revision: https://reviews.llvm.org/D141718

Added: 
    

Modified: 
    mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
    mlir/test/Dialect/GPU/transform-gpu.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index 6442523d624f0..69c87c433daf1 100644

--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -434,6 +434,12 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
       rewriter.create<ThreadIdOp>(loc, indexType, Dimension::x),
       rewriter.create<ThreadIdOp>(loc, indexType, Dimension::y),
       rewriter.create<ThreadIdOp>(loc, indexType, Dimension::z)};
+  // Replace ids of dimension size 1 by zero to simplify the IR.
+  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+  for (size_t i : llvm::seq(size_t(0), globalBlockDims.size())) {
+    if (globalBlockDims[i] == 1)
+      threadOps[i] = zero;
+  }
   IRMapping bvm;
   for (auto [blockIdx, blockDim] :
        llvm::zip(foreachThreadOp.getThreadIndices(), threadMapping)) {

diff  --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
index 97c9c19f6ec05..1c337fed5cc09 100644
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -194,3 +194,39 @@ transform.sequence failures(propagate) {
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
   transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [32]}
 }
+
+// -----
+
+!type = memref<3 x 2 x 32 x f32>
+!type1d = memref<32 x f32>
+
+// CHECK-LABEL: func.func @saxpy3d_fold_id_z(
+func.func @saxpy3d_fold_id_z(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !gpu.async.token) -> !type {
+  %one = arith.constant 1 : index
+  %c12 = arith.constant 12 : index
+  %c9 = arith.constant 9 : index
+  %c7 = arith.constant 7 : index
+//  CHECK: %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-NOT:   gpu.thread_id  z
+  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+  {
+    scf.foreach_thread (%i, %j, %k) in (%one, %c7, %c9) {
+//      CHECK:   memref.load %{{.*}}[%[[C0]],
+//      CHECK:   memref.load %{{.*}}[%[[C0]],
+        %4 = memref.load %x[%i, %j, %k] : !type
+        %5 = memref.load %y[%i, %j, %k] : !type
+        %6 = math.fma %alpha, %4, %5 : f32
+//      CHECK:   memref.store %{{.*}}, %{{.*}}[%[[C0]]
+        memref.store %6, %y[%i, %j, %k] : !type
+     }  { mapping = [#gpu.thread<z>, #gpu.thread<y>, #gpu.thread<x>] }
+    gpu.terminator
+  }
+  return %y : !type
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !pdl.operation):
+  %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
+  transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false }
+}