[Mlir-commits] [mlir] [mlir][gpu] reverse parallel loop to gpu dimension mapping order. (PR #79592)
Jungwook Park
llvmlistbot at llvm.org
Mon Jan 29 02:39:19 PST 2024
https://github.com/jungpark-mlir updated https://github.com/llvm/llvm-project/pull/79592
>From f8528ce894ef2a5501b930d78fd02593a713c57f Mon Sep 17 00:00:00 2001
From: jungpark-mlir <jungwook at jungwook-22.04>
Date: Fri, 26 Jan 2024 12:37:30 +0000
Subject: [PATCH 1/2] [mlir][gpu] reverse parallel loop to gpu dimension
mapping order. Map dimension id x to the inner-most loop and use y and z for
the outer loops. This implements the mapping previously described in the TODO.
---
.../GPU/Transforms/ParallelLoopMapper.cpp | 30 ++++++++++---------
mlir/test/Dialect/GPU/mapping.mlir | 16 +++++-----
2 files changed, 24 insertions(+), 22 deletions(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index 72e0ebc132e8620..06f54dc3c31bc8c 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -64,12 +64,11 @@ static MappingLevel &operator++(MappingLevel &mappingLevel) {
return mappingLevel;
}
-/// Computed the hardware id to use for a given mapping level. Will
-/// assign x,y and z hardware ids for the first 3 dimensions and use
-/// sequential after.
-/// TODO: Make this use x for the inner-most loop that is
-/// distributed to map to x, the next innermost to y and the next innermost to
-/// z.
+/// Computes the hardware id to use for a given mapping level. Assigns the z,
+/// y and x hardware ids to the outer-most three dimensions respectively and
+/// uses sequential for any remaining dimensions. When fewer than three loops
+/// are nested, x is mapped to the inner-most loop first and y and z are used
+/// for the enclosing loops.
static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
if (dimension >= kNumHardwareIds || level == Sequential)
@@ -78,11 +77,11 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
case MapGrid:
switch (dimension) {
case 0:
- return Processor::BlockX;
+ return Processor::BlockZ;
case 1:
return Processor::BlockY;
case 2:
- return Processor::BlockZ;
+ return Processor::BlockX;
default:
return Processor::Sequential;
}
@@ -90,11 +89,11 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
case MapBlock:
switch (dimension) {
case 0:
- return Processor::ThreadX;
+ return Processor::ThreadZ;
case 1:
return Processor::ThreadY;
case 2:
- return Processor::ThreadZ;
+ return Processor::ThreadX;
default:
return Processor::Sequential;
}
@@ -115,12 +114,15 @@ static void mapParallelOp(ParallelOp parallelOp,
MLIRContext *ctx = parallelOp.getContext();
Builder b(ctx);
+ int numLoops = parallelOp.getNumLoops();
+ int dimOffset = (numLoops > 2) ? 0 : (3 - numLoops);
SmallVector<ParallelLoopDimMappingAttr, 4> attrs;
- attrs.reserve(parallelOp.getNumLoops());
- for (int i = 0, e = parallelOp.getNumLoops(); i < e; ++i) {
+ attrs.reserve(numLoops);
+
+ for (int i = 0, e = numLoops; i < e; ++i) {
attrs.push_back(b.getAttr<ParallelLoopDimMappingAttr>(
- getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
- b.getDimIdentityMap()));
+ getHardwareIdForMapping(mappingLevel, i + dimOffset),
+ b.getDimIdentityMap(), b.getDimIdentityMap()));
}
(void)setMappingAttr(parallelOp, attrs);
++mappingLevel;
diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir
index 395987317a1e6ca..bf4872d8f8cad6b 100644
--- a/mlir/test/Dialect/GPU/mapping.mlir
+++ b/mlir/test/Dialect/GPU/mapping.mlir
@@ -17,10 +17,10 @@ func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
// CHECK-LABEL: func @parallel_loop(
// CHECK: scf.parallel
// CHECK: scf.parallel
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// CHECK-NOT: mapping
// -----
@@ -50,12 +50,12 @@ func.func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// CHECK-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
// CHECK-NOT: mapping
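
For readers following the thread, here is a small standalone C++ sketch (not part of the patch) of the dimension-to-hardware-id order the first commit introduces: the outer-most three parallel dimensions get z, y, x, and when fewer than three loops are nested an offset shifts the assignment so x still lands on the inner-most loop. The enum and helper below are simplified stand-ins for the MLIR types; only the index arithmetic mirrors the patch.

// Illustration of the new loop-dimension -> hardware-id order (grid level).
// The real pass uses mlir::gpu::Processor; this mock enum is only for the demo.
#include <cstdio>

enum class Processor { BlockX, BlockY, BlockZ, Sequential };

static const char *name(Processor p) {
  switch (p) {
  case Processor::BlockX: return "block_x";
  case Processor::BlockY: return "block_y";
  case Processor::BlockZ: return "block_z";
  default: return "sequential";
  }
}

// Mirrors getHardwareIdForMapping at the grid level: dimension 0 -> z,
// 1 -> y, 2 -> x, everything past the first three -> sequential.
static Processor hardwareIdForDim(int dimension) {
  switch (dimension) {
  case 0: return Processor::BlockZ;
  case 1: return Processor::BlockY;
  case 2: return Processor::BlockX;
  default: return Processor::Sequential;
  }
}

int main() {
  const int cases[] = {1, 2, 4};
  for (int numLoops : cases) {
    // Same offset as the patch: with fewer than three loops, shift the
    // dimension index so x is still assigned to the inner-most loop.
    int dimOffset = (numLoops > 2) ? 0 : (3 - numLoops);
    std::printf("%d loop(s):", numLoops);
    for (int i = 0; i < numLoops; ++i)
      std::printf(" %s", name(hardwareIdForDim(i + dimOffset)));
    std::printf("\n");
  }
  // Prints:
  //   1 loop(s): block_x
  //   2 loop(s): block_y block_x
  //   4 loop(s): block_z block_y block_x sequential
}

The two- and four-loop cases reproduce the orders checked in the updated mapping.mlir test above (block_y/block_x for two loops, block_z/block_y/block_x/sequential for four).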
>From af16827659983b13a38e37cd129d85eb206ba574 Mon Sep 17 00:00:00 2001
From: jungpark-mlir <jungwook at jungwook-22.04>
Date: Mon, 29 Jan 2024 10:39:01 +0000
Subject: [PATCH 2/2] format
---
mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index 06f54dc3c31bc8c..f45250b5a97763b 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -118,7 +118,6 @@ static void mapParallelOp(ParallelOp parallelOp,
int dimOffset = (numLoops > 2) ? 0 : (3 - numLoops);
SmallVector<ParallelLoopDimMappingAttr, 4> attrs;
attrs.reserve(numLoops);
-
for (int i = 0, e = numLoops; i < e; ++i) {
attrs.push_back(b.getAttr<ParallelLoopDimMappingAttr>(
getHardwareIdForMapping(mappingLevel, i + dimOffset),