[Mlir-commits] [mlir] [mlir][gpu] reverse parallel loop to gpu dimension mapping order. (PR #79592)
llvmlistbot at llvm.org
Fri Jan 26 04:52:00 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-mlir
@llvm/pr-subscribers-mlir-gpu
Author: Jungwook Park (jungpark-mlir)
<details>
<summary>Changes</summary>
Map dimension id x to the inner-most loop and use y and z for the outer loops.
This makes intuitive sense for the GPU work dimensions, where x is the lowest dimension that the GPU hardware iterates over first when assigning work items.
This was already noted as a 'TODO' in the previous code; a sketch of the resulting assignment follows.
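For illustration, here is a minimal standalone C++ sketch of the new assignment logic (the enum and helper names are illustrative stand-ins, not the actual MLIR API); it prints the hardware id each loop receives for nests of depth one to four:

```cpp
#include <cstdio>

// Hypothetical stand-ins for the gpu dialect's processor ids; the names
// below are for illustration only.
enum class Processor { BlockZ, BlockY, BlockX, Sequential };

static const char *name(Processor p) {
  switch (p) {
  case Processor::BlockZ: return "z";
  case Processor::BlockY: return "y";
  case Processor::BlockX: return "x";
  default:                return "sequential";
  }
}

// Mirrors the patched dimension -> hardware-id switch: the outer-most
// three dimensions get z, y, x; anything deeper is mapped sequentially.
static Processor idForDimension(int dimension) {
  switch (dimension) {
  case 0:  return Processor::BlockZ;
  case 1:  return Processor::BlockY;
  case 2:  return Processor::BlockX;
  default: return Processor::Sequential;
  }
}

int main() {
  for (int numLoops = 1; numLoops <= 4; ++numLoops) {
    // Same offset computation as the patch: shallow nests are shifted
    // toward x so the inner-most loop lands on the lowest hardware id.
    int dimOffset = (numLoops > 2) ? 0 : (3 - numLoops);
    printf("%d loop(s):", numLoops);
    for (int i = 0; i < numLoops; ++i)
      printf(" loop%d->%s", i, name(idForDimension(i + dimOffset)));
    printf("\n");
  }
}
```

Running this prints `loop0->x` for a single loop, `loop0->y loop1->x` for two, `loop0->z loop1->y loop2->x` for three, and `loop0->z loop1->y loop2->x loop3->sequential` for four, which matches the updated expectations in mapping.mlir below.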
---
Full diff: https://github.com/llvm/llvm-project/pull/79592.diff
2 Files Affected:
- (modified) mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp (+16-14)
- (modified) mlir/test/Dialect/GPU/mapping.mlir (+8-8)
``````````diff
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index 72e0ebc132e8620..06f54dc3c31bc8c 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -64,12 +64,11 @@ static MappingLevel &operator++(MappingLevel &mappingLevel) {
return mappingLevel;
}
-/// Computed the hardware id to use for a given mapping level. Will
-/// assign x,y and z hardware ids for the first 3 dimensions and use
-/// sequential after.
-/// TODO: Make this use x for the inner-most loop that is
-/// distributed to map to x, the next innermost to y and the next innermost to
-/// z.
+/// Computes the hardware id to use for a given mapping level. Assigns the z,
+/// y and x hardware ids to the outer-most three dimensions respectively and
+/// maps any remaining dimensions sequentially. When fewer than three loops
+/// are nested, x is first mapped to the inner-most loop and y, z are used
+/// for the successively enclosing loops.
static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
if (dimension >= kNumHardwareIds || level == Sequential)
@@ -78,11 +77,11 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
case MapGrid:
switch (dimension) {
case 0:
- return Processor::BlockX;
+ return Processor::BlockZ;
case 1:
return Processor::BlockY;
case 2:
- return Processor::BlockZ;
+ return Processor::BlockX;
default:
return Processor::Sequential;
}
@@ -90,11 +89,11 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
case MapBlock:
switch (dimension) {
case 0:
- return Processor::ThreadX;
+ return Processor::ThreadZ;
case 1:
return Processor::ThreadY;
case 2:
- return Processor::ThreadZ;
+ return Processor::ThreadX;
default:
return Processor::Sequential;
}
@@ -115,12 +114,15 @@ static void mapParallelOp(ParallelOp parallelOp,
MLIRContext *ctx = parallelOp.getContext();
Builder b(ctx);
+ int numLoops = parallelOp.getNumLoops();
+ int dimOffset = (numLoops > 2) ? 0 : (3 - numLoops);
SmallVector<ParallelLoopDimMappingAttr, 4> attrs;
- attrs.reserve(parallelOp.getNumLoops());
- for (int i = 0, e = parallelOp.getNumLoops(); i < e; ++i) {
+ attrs.reserve(numLoops);
+
+ for (int i = 0, e = numLoops; i < e; ++i) {
attrs.push_back(b.getAttr<ParallelLoopDimMappingAttr>(
- getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
- b.getDimIdentityMap()));
+ getHardwareIdForMapping(mappingLevel, i + dimOffset),
+ b.getDimIdentityMap(), b.getDimIdentityMap()));
}
(void)setMappingAttr(parallelOp, attrs);
++mappingLevel;
diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir
index 395987317a1e6ca..bf4872d8f8cad6b 100644
--- a/mlir/test/Dialect/GPU/mapping.mlir
+++ b/mlir/test/Dialect/GPU/mapping.mlir
@@ -17,10 +17,10 @@ func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
// CHECK-LABEL: func @parallel_loop(
// CHECK: scf.parallel
// CHECK: scf.parallel
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// CHECK-NOT: mapping
// -----
@@ -50,12 +50,12 @@ func.func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// CHECK-NOT: mapping
``````````
</details>
https://github.com/llvm/llvm-project/pull/79592