[Mlir-commits] [mlir] [mlir][gpu] reverse parallel loop to gpu dimension mapping order. (PR #79592)
llvmlistbot at llvm.org
Fri Jan 26 04:52:00 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-mlir
@llvm/pr-subscribers-mlir-gpu
Author: Jungwook Park (jungpark-mlir)
<details>
<summary>Changes</summary>
Map dimension id x to the inner-most loop and use y and z for the outer loops.
This makes intuitive sense for the GPU work dimensions, where x is the lowest dimension that the GPU hardware iterates over first when assigning work items.
This was already noted as a 'TODO' in the previous code; a sketch of the resulting assignment follows.
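For illustration, here is a minimal standalone C++ sketch of the new assignment logic (the enum and helper names are illustrative stand-ins, not the actual MLIR API); it prints the hardware id each loop receives for nests of depth one to four:

```cpp
#include <cstdio>

// Hypothetical stand-ins for the gpu dialect's processor ids; the names
// below are for illustration only.
enum class Processor { BlockZ, BlockY, BlockX, Sequential };

static const char *name(Processor p) {
  switch (p) {
  case Processor::BlockZ: return "z";
  case Processor::BlockY: return "y";
  case Processor::BlockX: return "x";
  default:                return "sequential";
  }
}

// Mirrors the patched dimension -> hardware-id switch: the outer-most
// three dimensions get z, y, x; anything deeper is mapped sequentially.
static Processor idForDimension(int dimension) {
  switch (dimension) {
  case 0:  return Processor::BlockZ;
  case 1:  return Processor::BlockY;
  case 2:  return Processor::BlockX;
  default: return Processor::Sequential;
  }
}

int main() {
  for (int numLoops = 1; numLoops <= 4; ++numLoops) {
    // Same offset computation as the patch: shallow nests are shifted
    // toward x so the inner-most loop lands on the lowest hardware id.
    int dimOffset = (numLoops > 2) ? 0 : (3 - numLoops);
    printf("%d loop(s):", numLoops);
    for (int i = 0; i < numLoops; ++i)
      printf(" loop%d->%s", i, name(idForDimension(i + dimOffset)));
    printf("\n");
  }
}
```

Running this prints `loop0->x` for a single loop, `loop0->y loop1->x` for two, `loop0->z loop1->y loop2->x` for three, and `loop0->z loop1->y loop2->x loop3->sequential` for four, which matches the updated expectations in mapping.mlir below.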
---
Full diff: https://github.com/llvm/llvm-project/pull/79592.diff
2 Files Affected:
- (modified) mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp (+16-14)
- (modified) mlir/test/Dialect/GPU/mapping.mlir (+8-8)
``````````diff
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index 72e0ebc132e8620..06f54dc3c31bc8c 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -64,12 +64,11 @@ static MappingLevel &operator++(MappingLevel &mappingLevel) {
return mappingLevel;
}
-/// Computed the hardware id to use for a given mapping level. Will
-/// assign x,y and z hardware ids for the first 3 dimensions and use
-/// sequential after.
-/// TODO: Make this use x for the inner-most loop that is
-/// distributed to map to x, the next innermost to y and the next innermost to
-/// z.
+/// Computes the hardware id to use for a given mapping level. Assigns the z,
+/// y and x hardware ids to the outer-most three dimensions respectively and
+/// maps any remaining dimensions sequentially. When fewer than three loops
+/// are nested, x is first mapped to the inner-most loop and y, z are used
+/// for the successively enclosing loops.
static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
if (dimension >= kNumHardwareIds || level == Sequential)
@@ -78,11 +77,11 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
case MapGrid:
switch (dimension) {
case 0:
- return Processor::BlockX;
+ return Processor::BlockZ;
case 1:
return Processor::BlockY;
case 2:
- return Processor::BlockZ;
+ return Processor::BlockX;
default:
return Processor::Sequential;
}
@@ -90,11 +89,11 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
case MapBlock:
switch (dimension) {
case 0:
- return Processor::ThreadX;
+ return Processor::ThreadZ;
case 1:
return Processor::ThreadY;
case 2:
- return Processor::ThreadZ;
+ return Processor::ThreadX;
default:
return Processor::Sequential;
}
@@ -115,12 +114,15 @@ static void mapParallelOp(ParallelOp parallelOp,
MLIRContext *ctx = parallelOp.getContext();
Builder b(ctx);
+ int numLoops = parallelOp.getNumLoops();
+ int dimOffset = (numLoops > 2) ? 0 : (3 - numLoops);
SmallVector<ParallelLoopDimMappingAttr, 4> attrs;
- attrs.reserve(parallelOp.getNumLoops());
- for (int i = 0, e = parallelOp.getNumLoops(); i < e; ++i) {
+ attrs.reserve(numLoops);
+
+ for (int i = 0, e = numLoops; i < e; ++i) {
attrs.push_back(b.getAttr<ParallelLoopDimMappingAttr>(
- getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
- b.getDimIdentityMap()));
+ getHardwareIdForMapping(mappingLevel, i + dimOffset),
+ b.getDimIdentityMap(), b.getDimIdentityMap()));
}
(void)setMappingAttr(parallelOp, attrs);
++mappingLevel;
diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir
index 395987317a1e6ca..bf4872d8f8cad6b 100644
--- a/mlir/test/Dialect/GPU/mapping.mlir
+++ b/mlir/test/Dialect/GPU/mapping.mlir
@@ -17,10 +17,10 @@ func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
// CHECK-LABEL: func @parallel_loop(
// CHECK: scf.parallel
// CHECK: scf.parallel
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// CHECK-NOT: mapping
// -----
@@ -50,12 +50,12 @@ func.func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// CHECK-NOT: mapping
``````````
</details>
https://github.com/llvm/llvm-project/pull/79592