[Mlir-commits] [mlir] [mlir][gpu] Loosen the condition to convert scf.parallel to gpu.launch (PR #164978)

Hsiangkai Wang llvmlistbot at llvm.org
Fri Oct 24 06:10:33 PDT 2025


https://github.com/Hsiangkai created https://github.com/llvm/llvm-project/pull/164978

Use LocalAliasAnalysis to improve the handling of side effects around nested scf.parallel. If the memory written outside a nested scf.parallel does not alias the memory accessed inside the nested loop, the loop nest can still be converted to gpu.launch.
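
The test added in this patch shows the case that is now accepted. For contrast, here is a minimal hand-written sketch (not part of the patch; the function and value names are made up) of a nest that should still be rejected under the new check, because the buffer written in the outer body, %alloc, is read again inside the nested loop:

func.func @nested_parallel_still_rejected() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  %alloc = memref.alloc() : memref<4x4xf32>
  %alloc_out = memref.alloc() : memref<4x4xf32>
  scf.parallel (%i) = (%c0) to (%c4) step (%c1) {
    // Write to %alloc before entering the nested loop ...
    memref.store %cst, %alloc[%i, %c0] : memref<4x4xf32>
    scf.parallel (%j) = (%c0) to (%c4) step (%c1) {
      // ... which aliases this load, so the pattern still returns failure.
      %0 = memref.load %alloc[%i, %j] : memref<4x4xf32>
      memref.store %0, %alloc_out[%i, %j] : memref<4x4xf32>
      scf.reduce
    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
    scf.reduce
  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
  return
}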

From 2e1c0045d0f8c2a658350f17c109ec71512d1c92 Mon Sep 17 00:00:00 2001
From: Hsiangkai Wang <hsiangkai.wang at arm.com>
Date: Fri, 24 Oct 2025 11:58:39 +0100
Subject: [PATCH] [mlir][gpu] Loosen the condition to convert scf.parallel to
 gpu.launch

Use LocalAliasAnalysis to improve the handling of side effects around
nested scf.parallel. If the memory written outside a nested scf.parallel
does not alias the memory accessed inside the nested loop, the loop nest
can still be converted to gpu.launch.
---
 mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp     | 52 ++++++++++++++++++-
 .../Conversion/SCFToGPU/parallel_loop.mlir    | 32 ++++++++++++
 2 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
index 7d0a236b6f69a..9f977802f4404 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -14,6 +14,7 @@
 
 #include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
 
+#include "mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
@@ -27,6 +28,7 @@
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/DebugLog.h"
 #include <optional>
 
@@ -625,6 +627,8 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
   bool seenSideeffects = false;
   // Whether we have left a nesting scope (and hence are no longer innermost).
   bool leftNestingScope = false;
+  LocalAliasAnalysis aliasAnalysis;
+  llvm::DenseSet<Value> writtenBuffer;
   while (!worklist.empty()) {
     Operation *op = worklist.pop_back_val();
     // Now walk over the body and clone it.
@@ -635,8 +639,39 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
     if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
       // Before entering a nested scope, make sure there have been no
       // sideeffects until now.
-      if (seenSideeffects)
-        return failure();
+      if (seenSideeffects) {
+        // Go through all operations in the nested parallel and check if any
+        // of the side-effecting operations access buffers that have been
+        // written to in the outer scope.
+        bool accessesWrittenBuffer = false;
+        nestedParallel.walk([&](Operation *nestedOp) {
+          if (accessesWrittenBuffer)
+            return;
+          if (isMemoryEffectFree(nestedOp))
+            return;
+
+          if (auto memEffectInterface =
+                  dyn_cast<MemoryEffectOpInterface>(nestedOp)) {
+            SmallVector<MemoryEffects::EffectInstance> effects;
+            memEffectInterface.getEffects(effects);
+            for (const auto &effect : effects) {
+              if (isa<MemoryEffects::Read>(effect.getEffect()) ||
+                  isa<MemoryEffects::Write>(effect.getEffect())) {
+                Value baseBuffer = effect.getValue();
+                for (auto val : writtenBuffer) {
+                  if (aliasAnalysis.alias(baseBuffer, val) !=
+                      AliasResult::NoAlias) {
+                    accessesWrittenBuffer = true;
+                    return;
+                  }
+                }
+              }
+            }
+          }
+        });
+        if (accessesWrittenBuffer)
+          return failure();
+      }
       // A nested scf.parallel needs insertion of code to compute indices.
       // Insert that now. This will also update the worklist with the loops
       // body.
@@ -650,6 +685,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
       rewriter.setInsertionPointAfter(parent);
       leftNestingScope = true;
       seenSideeffects = false;
+      writtenBuffer.clear();
     } else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
       // Convert scf.reduction op
       auto parentLoop = op->getParentOfType<ParallelOp>();
@@ -682,6 +718,18 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
       Operation *clone = rewriter.clone(*op, cloningMap);
       cloningMap.map(op->getResults(), clone->getResults());
       // Check for side effects.
+      if (!isMemoryEffectFree(clone)) {
+        // Record the buffers written to by the operation.
+        if (auto memEffectInterface =
+                dyn_cast<MemoryEffectOpInterface>(clone)) {
+          SmallVector<MemoryEffects::EffectInstance> effects;
+          memEffectInterface.getEffects(effects);
+          for (const auto &effect : effects) {
+            if (isa<MemoryEffects::Write>(effect.getEffect()))
+              writtenBuffer.insert(effect.getValue());
+          }
+        }
+      }
       // TODO: Handle region side effects properly.
       seenSideeffects |=
           !isMemoryEffectFree(clone) || clone->getNumRegions() != 0;
diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
index 1dbce05be85b4..26f5a3e1f0ac0 100644
--- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
@@ -641,3 +641,35 @@ func.func @parallel_reduction_1d_outside() {
 // CHECK: scf.parallel
 // CHECK-NEXT: scf.parallel
 // CHECK: scf.reduce
+
+// -----
+
+// CHECK-LABEL: @nested_parallel_with_side_effect
+func.func @nested_parallel_with_side_effect() {
+  %c65536 = arith.constant 65536 : index
+  %c2 = arith.constant 2 : index
+  %c256 = arith.constant 256 : index
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %c1 = arith.constant 1 : index
+  %alloc_0 = memref.alloc() : memref<2x256x256xf32>
+  %alloc_1 = memref.alloc() : memref<2x4x256x256xf32>
+  %alloc_2 = memref.alloc() : memref<4x4xf32>
+  %alloc_3 = memref.alloc() : memref<4x4xf32>
+  scf.parallel (%arg2, %arg3, %arg4) = (%c0, %c0, %c0) to (%c2, %c4, %c65536) step (%c1, %c1, %c1) {
+    %1 = arith.remsi %arg4, %c256 : index
+    %2 = arith.divsi %arg4, %c256 : index
+    %4 = memref.load %alloc_0[%arg2, %2, %1] : memref<2x256x256xf32>
+    memref.store %4, %alloc_1[%arg2, %arg3, %2, %1] : memref<2x4x256x256xf32>
+    scf.parallel (%arg5) = (%c0) to (%c4) step (%c1) {
+      %5 = memref.load %alloc_2[%arg5, %c0] : memref<4x4xf32>
+      memref.store %5, %alloc_3[%arg5, %c0] : memref<4x4xf32>
+      scf.reduce
+    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+    scf.reduce
+  } {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+  return
+}
+
+// CHECK: gpu.launch
+// CHECK-NOT: scf.parallel


