[Mlir-commits] [mlir] [mlir][gpu] Add innermost-first policy when mapping loops to GPU IDs (PR #160634)
Georgios Pinitas
llvmlistbot at llvm.org
Thu Sep 25 03:51:47 PDT 2025
https://github.com/GeorgeARM updated https://github.com/llvm/llvm-project/pull/160634
>From 4bd408995cd92f4ada8e9a8093116f1a100ee114 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas at arm.com>
Date: Thu, 25 Sep 2025 04:10:42 +0100
Subject: [PATCH 1/2] [mlir][gpu] Add innermost-first policy when mapping loops
to GPU IDs
Extend the `GpuMapParallelLoopsPass` with a new `--mapping-policy`
option. The existing behavior is retained as the default policy,
namely `outermost-first`, while support is added for a new policy,
`innermost-first`, that maps the loops in reverse order.
Signed-off-by: Georgios Pinitas <georgios.pinitas at arm.com>
---
.../mlir/Dialect/GPU/Transforms/Passes.td | 11 +++
.../GPU/Transforms/ParallelLoopMapper.cpp | 62 +++++++++++---
mlir/test/Dialect/GPU/mapping.mlir | 80 +++++++++++++------
3 files changed, 117 insertions(+), 36 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 187ac9aa18aac..0c8a0c7a677ab 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -39,8 +39,19 @@ def GpuMapParallelLoopsPass
encountered to the local workgroup. Within each mapping, the first three
dimensions are mapped to x/y/z hardware ids and all following dimensions are
mapped to sequential loops.
+
+ Ordering of the loop mapping against the different dimensions is controlled
+ by the `mapping-policy` option.
+ Two policies are supported:
+ 1. `outermost-first` (default): the outermost loop maps to X, then Y
+ and finally Z.
+ 2. `innermost-first`: the innermost loop maps to X, then Y and finally Z.
}];
let dependentDialects = ["mlir::gpu::GPUDialect"];
+ let options = [Option<"mappingPolicyStr", "mapping-policy", "std::string",
+ /*default=*/"\"outermost-first\"",
+ "Policy outlining how to assign loops to GPU dimensions. "
+ "Supported values are `outermost-first` and `innermost-first`.">];
}
def GpuEliminateBarriers
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index a098e721303a8..7b335d1d5f224 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -52,6 +52,7 @@ gpu::setMappingAttr(ParallelOp ploopOp,
namespace gpu {
namespace {
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
+enum class MappingPolicy { OutermostFirst, InnermostFirst };
} // namespace
static constexpr int kNumHardwareIds = 3;
@@ -65,16 +66,29 @@ static MappingLevel &operator++(MappingLevel &mappingLevel) {
return mappingLevel;
}
+// Map the policy string to a typed mapping policy.
+static FailureOr<MappingPolicy> getMappingPolicyFromStr(StringRef policy) {
+ std::string policyCanonical = policy.trim().lower();
+
+ auto option =
+ llvm::StringSwitch<std::optional<MappingPolicy>>(policyCanonical)
+ .Case("innermost-first", MappingPolicy::InnermostFirst)
+ .Case("outermost-first", MappingPolicy::OutermostFirst)
+ .Default(std::nullopt);
+
+ if (!option)
+ return failure();
+ return *option;
+}
+
/// Compute the hardware id to use for a given mapping level. Will
/// assign x,y and z hardware ids for the first 3 dimensions and use
/// sequential after.
-/// TODO: Make this use x for the inner-most loop that is
-/// distributed to map to x, the next innermost to y and the next innermost to
-/// z.
static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
if (dimension >= kNumHardwareIds || level == Sequential)
return Processor::Sequential;
+
switch (level) {
case MapGrid:
switch (dimension) {
@@ -107,20 +121,34 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
/// Add mapping information to the given parallel loop. Do not add
/// mapping information if the loop already has it. Also, don't
/// start a mapping at a nested loop.
-static void mapParallelOp(ParallelOp parallelOp,
- MappingLevel mappingLevel = MapGrid) {
+static void
+mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid,
+ MappingPolicy mappingPolicy = MappingPolicy::OutermostFirst) {
// Do not try to add a mapping to already mapped loops or nested loops.
if (parallelOp->getAttr(getMappingAttrName()) ||
((mappingLevel == MapGrid) && parallelOp->getParentOfType<ParallelOp>()))
return;
+ const int numLoops = static_cast<int>(parallelOp.getNumLoops());
+ const int loopsToMap = std::min(numLoops, kNumHardwareIds);
+
MLIRContext *ctx = parallelOp.getContext();
Builder b(ctx);
SmallVector<ParallelLoopDimMappingAttr, 4> attrs;
- attrs.reserve(parallelOp.getNumLoops());
- for (int i = 0, e = parallelOp.getNumLoops(); i < e; ++i) {
+ attrs.reserve(numLoops);
+
+ for (int i = 0, e = numLoops; i < e; ++i) {
+
+ // Determine the mapping to use for this loop.
+ int hwMapping = kNumHardwareIds;
+ if (i < loopsToMap) {
+ hwMapping = (mappingPolicy == MappingPolicy::OutermostFirst)
+ ? i
+ : (loopsToMap - 1 - i);
+ }
+
attrs.push_back(b.getAttr<ParallelLoopDimMappingAttr>(
- getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
+ getHardwareIdForMapping(mappingLevel, hwMapping), b.getDimIdentityMap(),
b.getDimIdentityMap()));
}
(void)setMappingAttr(parallelOp, attrs);
@@ -129,16 +157,30 @@ static void mapParallelOp(ParallelOp parallelOp,
// walk but just iterate over the operations.
for (Operation &op : *parallelOp.getBody()) {
if (ParallelOp nested = dyn_cast<ParallelOp>(op))
- mapParallelOp(nested, mappingLevel);
+ mapParallelOp(nested, mappingLevel, mappingPolicy);
}
}
namespace {
struct GpuMapParallelLoopsPass
: public impl::GpuMapParallelLoopsPassBase<GpuMapParallelLoopsPass> {
+ using Base::Base;
+
void runOnOperation() override {
+ // Parse the mapping policy.
+ auto policyOrFailure = getMappingPolicyFromStr(mappingPolicyStr);
+ if (failed(policyOrFailure)) {
+ getOperation()->emitError() << "invalid mapping policy specified";
+ return signalPassFailure();
+ }
+
+ auto policy = *policyOrFailure;
+ MappingLevel topLevel = MappingLevel::MapGrid;
+
for (Region ®ion : getOperation()->getRegions()) {
- region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
+ region.walk([&](ParallelOp parallelOp) {
+ mapParallelOp(parallelOp, topLevel, policy);
+ });
}
}
};
diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir
index 395987317a1e6..b313ab69cc001 100644
--- a/mlir/test/Dialect/GPU/mapping.mlir
+++ b/mlir/test/Dialect/GPU/mapping.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s --check-prefix=OUTER
+// RUN: mlir-opt -gpu-map-parallel-loops="mapping-policy=innermost-first" -split-input-file %s | FileCheck %s --check-prefix=INNER
func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
%arg3 : index) {
@@ -14,14 +15,23 @@ func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
return
}
-// CHECK-LABEL: func @parallel_loop(
-// CHECK: scf.parallel
-// CHECK: scf.parallel
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK-NOT: mapping
+// OUTER-LABEL: func @parallel_loop(
+// OUTER: scf.parallel
+// OUTER: scf.parallel
+// OUTER: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER-NOT: mapping
+
+// INNER-LABEL: func @parallel_loop(
+// INNER: scf.parallel
+// INNER: scf.parallel
+// INNER: {mapping = [#gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER: {mapping = [#gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER-NOT: mapping
// -----
@@ -42,20 +52,38 @@ func.func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
return
}
-// CHECK-LABEL: func @parallel_loop_4d(
-// CHECK: scf.parallel
-// CHECK: scf.parallel
-// CHECK: scf.parallel
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK-NOT: mapping
+// OUTER-LABEL: func @parallel_loop_4d(
+// OUTER: scf.parallel
+// OUTER: scf.parallel
+// OUTER: scf.parallel
+// OUTER: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER-NOT: mapping
+
+// INNER-LABEL: func @parallel_loop_4d(
+// INNER: scf.parallel
+// INNER: scf.parallel
+// INNER: scf.parallel
+// INNER: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER: {mapping = [#gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER: {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER-NOT: mapping
>From 28babcc94b635bb3b37c5f550464f8ba12e635a2 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas at arm.com>
Date: Thu, 25 Sep 2025 11:35:24 +0100
Subject: [PATCH 2/2] [mlir][gpu] Add innermost-first policy when mapping loops
to GPU IDs
Address review comments
Signed-off-by: Georgios Pinitas <georgios.pinitas at arm.com>
---
mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index 7b335d1d5f224..1afca9d180e22 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -137,9 +137,10 @@ mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid,
SmallVector<ParallelLoopDimMappingAttr, 4> attrs;
attrs.reserve(numLoops);
- for (int i = 0, e = numLoops; i < e; ++i) {
+ for (int i = 0; i < numLoops; ++i) {
// Determine the mapping to use for this loop.
+ // If there are more loops to map than hardware IDs, map the excess to sequential.
int hwMapping = kNumHardwareIds;
if (i < loopsToMap) {
hwMapping = (mappingPolicy == MappingPolicy::OutermostFirst)
More information about the Mlir-commits
mailing list