[Mlir-commits] [mlir] [mlir][gpu] Add innermost-first policy when mapping loops to GPU IDs (PR #160634)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Sep 24 20:24:56 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir
Author: Georgios Pinitas (GeorgeARM)
<details>
<summary>Changes</summary>
Extend the `GpuMapParallelLoopsPass` with a new `--mapping-policy` option. The existing behavior is retained as the default policy, namely `outermost-first`; support is added for a new `innermost-first` policy that maps the loops in reverse order.
---
Full diff: https://github.com/llvm/llvm-project/pull/160634.diff
3 Files Affected:
- (modified) mlir/include/mlir/Dialect/GPU/Transforms/Passes.td (+11)
- (modified) mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp (+52-10)
- (modified) mlir/test/Dialect/GPU/mapping.mlir (+54-26)
``````````diff
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 187ac9aa18aac..0c8a0c7a677ab 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -39,8 +39,19 @@ def GpuMapParallelLoopsPass
encountered to the local workgroup. Within each mapping, the first three
dimensions are mapped to x/y/z hardware ids and all following dimensions are
mapped to sequential loops.
+
+ Ordering of the loop mapping against the different dimensions is controlled
+ by the `mapping-policy` option.
+ Two policies are supported:
+ 1. `outermost-first` (default): the outermost loop maps to X, then Y
+ and finally Z.
+ 2. `innermost-first`: the innermost loop maps to X, then Y and finally Z.
}];
let dependentDialects = ["mlir::gpu::GPUDialect"];
+ let options = [Option<"mappingPolicyStr", "mapping-policy", "std::string",
+ /*default=*/"\"outermost-first\"",
+ "Policy outlining how to assign loops to GPU dimensions."
+ "Supported values are `outermost-first` and `innermost-first`.">];
}
def GpuEliminateBarriers
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
index a098e721303a8..7b335d1d5f224 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -52,6 +52,7 @@ gpu::setMappingAttr(ParallelOp ploopOp,
namespace gpu {
namespace {
enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
+enum class MappingPolicy { OutermostFirst, InnermostFirst };
} // namespace
static constexpr int kNumHardwareIds = 3;
@@ -65,16 +66,29 @@ static MappingLevel &operator++(MappingLevel &mappingLevel) {
return mappingLevel;
}
+// Map the policy string to a typed mapping policy.
+static FailureOr<MappingPolicy> getMappingPolicyFromStr(StringRef policy) {
+ std::string policyCanonical = policy.trim().lower();
+
+ auto option =
+ llvm::StringSwitch<std::optional<MappingPolicy>>(policyCanonical)
+ .Case("innermost-first", MappingPolicy::InnermostFirst)
+ .Case("outermost-first", MappingPolicy::OutermostFirst)
+ .Default(std::nullopt);
+
+ if (!option)
+ return failure();
+ return *option;
+}
+
/// Computed the hardware id to use for a given mapping level. Will
/// assign x,y and z hardware ids for the first 3 dimensions and use
/// sequential after.
-/// TODO: Make this use x for the inner-most loop that is
-/// distributed to map to x, the next innermost to y and the next innermost to
-/// z.
static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
if (dimension >= kNumHardwareIds || level == Sequential)
return Processor::Sequential;
+
switch (level) {
case MapGrid:
switch (dimension) {
@@ -107,20 +121,34 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) {
/// Add mapping information to the given parallel loop. Do not add
/// mapping information if the loop already has it. Also, don't
/// start a mapping at a nested loop.
-static void mapParallelOp(ParallelOp parallelOp,
- MappingLevel mappingLevel = MapGrid) {
+static void
+mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid,
+ MappingPolicy mappingPolicy = MappingPolicy::OutermostFirst) {
// Do not try to add a mapping to already mapped loops or nested loops.
if (parallelOp->getAttr(getMappingAttrName()) ||
((mappingLevel == MapGrid) && parallelOp->getParentOfType<ParallelOp>()))
return;
+ const int numLoops = static_cast<int>(parallelOp.getNumLoops());
+ const int loopsToMap = std::min(numLoops, kNumHardwareIds);
+
MLIRContext *ctx = parallelOp.getContext();
Builder b(ctx);
SmallVector<ParallelLoopDimMappingAttr, 4> attrs;
- attrs.reserve(parallelOp.getNumLoops());
- for (int i = 0, e = parallelOp.getNumLoops(); i < e; ++i) {
+ attrs.reserve(numLoops);
+
+ for (int i = 0, e = numLoops; i < e; ++i) {
+
+ // Determine the mapping to use for this loop.
+ int hwMapping = kNumHardwareIds;
+ if (i < loopsToMap) {
+ hwMapping = (mappingPolicy == MappingPolicy::OutermostFirst)
+ ? i
+ : (loopsToMap - 1 - i);
+ }
+
attrs.push_back(b.getAttr<ParallelLoopDimMappingAttr>(
- getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
+ getHardwareIdForMapping(mappingLevel, hwMapping), b.getDimIdentityMap(),
b.getDimIdentityMap()));
}
(void)setMappingAttr(parallelOp, attrs);
@@ -129,16 +157,30 @@ static void mapParallelOp(ParallelOp parallelOp,
// walk but just iterate over the operations.
for (Operation &op : *parallelOp.getBody()) {
if (ParallelOp nested = dyn_cast<ParallelOp>(op))
- mapParallelOp(nested, mappingLevel);
+ mapParallelOp(nested, mappingLevel, mappingPolicy);
}
}
namespace {
struct GpuMapParallelLoopsPass
: public impl::GpuMapParallelLoopsPassBase<GpuMapParallelLoopsPass> {
+ using Base::Base;
+
void runOnOperation() override {
+ // Parse the mapping policy.
+ auto policyOrFailure = getMappingPolicyFromStr(mappingPolicyStr);
+ if (failed(policyOrFailure)) {
+ getOperation()->emitError() << "Invalid mapping policy specified.";
+ return signalPassFailure();
+ }
+
+ auto policy = *policyOrFailure;
+ MappingLevel topLevel = MappingLevel::MapGrid;
+
for (Region ®ion : getOperation()->getRegions()) {
- region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); });
+ region.walk([&](ParallelOp parallelOp) {
+ mapParallelOp(parallelOp, topLevel, policy);
+ });
}
}
};
diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir
index 395987317a1e6..b313ab69cc001 100644
--- a/mlir/test/Dialect/GPU/mapping.mlir
+++ b/mlir/test/Dialect/GPU/mapping.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s --check-prefix=OUTER
+// RUN: mlir-opt -gpu-map-parallel-loops="mapping-policy=innermost-first" -split-input-file %s | FileCheck %s --check-prefix=INNER
func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
%arg3 : index) {
@@ -14,14 +15,23 @@ func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
return
}
-// CHECK-LABEL: func @parallel_loop(
-// CHECK: scf.parallel
-// CHECK: scf.parallel
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK-NOT: mapping
+// OUTER-LABEL: func @parallel_loop(
+// OUTER: scf.parallel
+// OUTER: scf.parallel
+// OUTER: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER-NOT: mapping
+
+// INNER-LABEL: func @parallel_loop(
+// INNER: scf.parallel
+// INNER: scf.parallel
+// INNER: {mapping = [#gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER: {mapping = [#gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER-NOT: mapping
// -----
@@ -42,20 +52,38 @@ func.func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index,
return
}
-// CHECK-LABEL: func @parallel_loop_4d(
-// CHECK: scf.parallel
-// CHECK: scf.parallel
-// CHECK: scf.parallel
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
-// CHECK-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
-// CHECK-NOT: mapping
+// OUTER-LABEL: func @parallel_loop_4d(
+// OUTER: scf.parallel
+// OUTER: scf.parallel
+// OUTER: scf.parallel
+// OUTER: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER: {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER: {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// OUTER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// OUTER-NOT: mapping
+
+// INNER-LABEL: func @parallel_loop_4d(
+// INNER: scf.parallel
+// INNER: scf.parallel
+// INNER: scf.parallel
+// INNER: {mapping = [#gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER: {mapping = [#gpu.loop_dim_map<processor = thread_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER: {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>,
+// INNER-SAME: #gpu.loop_dim_map<processor = sequential, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+// INNER-NOT: mapping
``````````
</details>
https://github.com/llvm/llvm-project/pull/160634
More information about the Mlir-commits
mailing list