[Mlir-commits] [mlir] [mlir][gpu] Prevent parallelization of reduction loops in affine-for-to-gpu (PR #176656)

Sun Jan 18 08:28:39 PST 2026

https://github.com/Men-cotton created https://github.com/llvm/llvm-project/pull/176656

Guard the conversion in `convert-affine-for-to-gpu` with `affine::isLoopParallel`. Prevent data races and miscompilation caused by mapping sequential reduction loops to parallel GPU dimensions.

Verify parallelism for the first `numBlockDims + numThreadDims` loops. Traverse only the chain of directly nested loops (the first `affine.for` at each level) to match the converter's expectations. If any mapped loop is not parallel, skip the conversion and leave that loop nest unchanged; sequential loops nested deeper than the mapped depth do not block the conversion and remain intact inside the kernel.

Link `MLIRAffineAnalysis` to `MLIRSCFToGPU` to handle the new dependency.

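Add regression tests in `mlir/test/Conversion/SCFToGPU/reduction-loop.mlir`, and switch the existing `no_blocks_no_threads.mlir`, `step_one.mlir`, and `step_positive.mlir` tests from `memref.load`/`memref.store` to `affine.load`/`affine.store`.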

Fix: https://github.com/llvm/llvm-project/issues/139221


>From 3285f6ea5263a19668c72c6d7afc653f1847dc01 Mon Sep 17 00:00:00 2001
From: mencotton <mencotton0410 at gmail.com>
Date: Sun, 18 Jan 2026 00:19:25 +0900
Subject: [PATCH] [mlir][gpu] Prevent parallelization of reduction loops in
 affine-for-to-gpu

---
 mlir/lib/Conversion/SCFToGPU/CMakeLists.txt   |   1 +
 mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp |  23 ++++
 .../SCFToGPU/no_blocks_no_threads.mlir        |   9 +-
 .../Conversion/SCFToGPU/reduction-loop.mlir   | 105 ++++++++++++++++++
 mlir/test/Conversion/SCFToGPU/step_one.mlir   |  13 +--
 .../Conversion/SCFToGPU/step_positive.mlir    |   8 +-
 6 files changed, 143 insertions(+), 16 deletions(-)
 create mode 100644 mlir/test/Conversion/SCFToGPU/reduction-loop.mlir

diff --git a/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt b/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt
index b7853634bc44e..f8112bbdd548f 100644
--- a/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt
+++ b/mlir/lib/Conversion/SCFToGPU/CMakeLists.txt
@@ -9,6 +9,7 @@ add_mlir_conversion_library(MLIRSCFToGPU
   MLIRConversionPassIncGen
 
   LINK_LIBS PUBLIC
+  MLIRAffineAnalysis
   MLIRAffineDialect
   MLIRAffineToStandard
   MLIRArithDialect
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
index c816356cf3f96..4c1fd59bb9042 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h"
 
 #include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
+#include "mlir/Dialect/Affine/Analysis/AffineAnalysis.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Transforms/DialectConversion.h"
@@ -29,11 +30,33 @@ namespace {
 struct ForLoopMapper
     : public impl::ConvertAffineForToGPUPassBase<ForLoopMapper> {
   using Base::Base;
+  /// Returns false if any of the first `numMappedLoops` loops, walking from
+  /// 'forOp' through each first nested affine.for, fails isLoopParallel.
+  static bool areMappedLoopsParallel(affine::AffineForOp forOp,
+                                     unsigned numMappedLoops) {
+    affine::AffineForOp currentLoop = forOp;
+    for (unsigned i = 0; i < numMappedLoops; ++i) {
+      if (!affine::isLoopParallel(currentLoop))
+        return false;
+      if (i + 1 < numMappedLoops) {
+        auto nestedLoops = currentLoop.getBody()->getOps<affine::AffineForOp>();
+        if (nestedLoops.empty())
+          // Return true here to let the conversion fail later on structural
+          // mismatch if the nest is not deep enough.
+          return true;
+        // Follow the first directly nested loop; siblings are not checked.
+        currentLoop = *nestedLoops.begin();
+      }
+    }
+    return true;
+  }
 
   void runOnOperation() override {
     for (Operation &op : llvm::make_early_inc_range(
              getOperation().getFunctionBody().getOps())) {
       if (auto forOp = dyn_cast<affine::AffineForOp>(&op)) {
+        if (!areMappedLoopsParallel(forOp, numBlockDims + numThreadDims))
+          continue;
         if (failed(convertAffineLoopNestToGPULaunch(forOp, numBlockDims,
                                                     numThreadDims)))
           signalPassFailure();
diff --git a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
index 79eef8ae7eb85..107a6c3838010 100644
--- a/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
+++ b/mlir/test/Conversion/SCFToGPU/no_blocks_no_threads.mlir
@@ -21,14 +21,13 @@ func.func @one_d_loop(%A : memref<?xf32>, %B : memref<?xf32>) {
   // CHECK-BLOCKS-NEXT: gpu.launch blocks(%[[B0:.*]], %[[B1:.*]], %[[B2:.*]]) in (%{{.*}} = %[[BOUND]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]]) threads(%[[T0:.*]], %[[T1:.*]], %[[T2:.*]]) in (%{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]], %{{.*}} = %[[ONE]])
   affine.for %i = 0 to 42 {
   // CHECK-THREADS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[T0]]
-  // CHECK-THREADS-NEXT: memref.load %{{.*}}[%[[INDEX]]]
+  // CHECK-THREADS-NEXT: affine.load %{{.*}}[%[[INDEX]]]
   // CHECK-BLOCKS-NEXT: %[[INDEX:.*]] = arith.addi %{{.*}}, %[[B0]]
-  // CHECK-BLOCKS-NEXT: memref.load %{{.*}}[%[[INDEX]]]
-    %0 = memref.load %A[%i] : memref<?xf32>
-    memref.store %0, %B[%i] : memref<?xf32>
+  // CHECK-BLOCKS-NEXT: affine.load %{{.*}}[%[[INDEX]]]
+    %0 = affine.load %A[%i] : memref<?xf32>
+    affine.store %0, %B[%i] : memref<?xf32>
     // CHECK-THREADS: gpu.terminator
     // CHECK-BLOCKS: gpu.terminator
   }
   return
 }
-
diff --git a/mlir/test/Conversion/SCFToGPU/reduction-loop.mlir b/mlir/test/Conversion/SCFToGPU/reduction-loop.mlir
new file mode 100644
index 0000000000000..16770ccba58c3
--- /dev/null
+++ b/mlir/test/Conversion/SCFToGPU/reduction-loop.mlir
@@ -0,0 +1,105 @@
+// RUN: mlir-opt -pass-pipeline="builtin.module(func.func(convert-affine-for-to-gpu{gpu-block-dims=1 gpu-thread-dims=1}))" %s | FileCheck %s
+
+/// Test parallelization legality checks in affine-for-to-gpu conversion.
+/// The pass is configured to map the first 2 loops to GPU block (depth 0) and
+/// GPU thread (depth 1) respectively.
+
+// CHECK-LABEL: func @map_to_gpu_inner_dep_unmapped
+// CHECK-SAME: %[[MEM:.*]]: memref<10x10x10xf32>
+func.func @map_to_gpu_inner_dep_unmapped(%mem: memref<10x10x10xf32>) {
+  /// The inner loop 'k' (depth=2) carries a dependency. However, since the
+  /// mapping only covers depth 0 and 1, 'k' remains sequential inside the
+  /// GPU kernel. The outer loops are dependency-free and safe to map.
+
+  // CHECK: gpu.launch
+  affine.for %i = 0 to 10 {
+    affine.for %j = 0 to 10 {
+      // CHECK: affine.for %{{.*}} = 1 to 10
+      // CHECK: affine.load %[[MEM]]
+      // CHECK: affine.store %{{.*}}, %[[MEM]]
+      affine.for %k = 1 to 10 {
+         %0 = affine.load %mem[%i, %j, %k - 1] : memref<10x10x10xf32>
+         affine.store %0, %mem[%i, %j, %k] : memref<10x10x10xf32>
+      }
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @negative_map_to_gpu_block_dep
+func.func @negative_map_to_gpu_block_dep(%mem: memref<10xf32>) {
+  /// The loop 'i' is mapped to a block dimension (depth=0).
+  /// The loop-carried dependency makes parallelization unsafe.
+
+  // CHECK-NOT: gpu.launch
+  // CHECK: affine.for
+  affine.for %i = 1 to 10 {
+     %0 = affine.load %mem[%i - 1] : memref<10xf32>
+     affine.store %0, %mem[%i] : memref<10xf32>
+  }
+  return
+}
+
+// CHECK-LABEL: func @negative_map_to_gpu_thread_dep
+func.func @negative_map_to_gpu_thread_dep(%mem: memref<10x10xf32>) {
+  /// The inner loop 'j' is mapped to a thread dimension (depth=1).
+  /// A dependency in any mapped loop invalidates the entire nest conversion.
+
+  // CHECK-NOT: gpu.launch
+  // CHECK: affine.for
+  affine.for %i = 0 to 10 {
+    // CHECK: affine.for
+    affine.for %j = 1 to 10 {
+       %0 = affine.load %mem[%i, %j - 1] : memref<10x10xf32>
+       affine.store %0, %mem[%i, %j] : memref<10x10xf32>
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @negative_map_to_gpu_imperfect_nest_dep
+func.func @negative_map_to_gpu_imperfect_nest_dep(%mem: memref<10x10xf32>) {
+  /// Imperfect nest: the first inner loop 'j' carries a dependency and would be
+  /// mapped to the thread dimension, so the whole nest is left unconverted.
+
+  // CHECK-NOT: gpu.launch
+  // CHECK: affine.for
+  affine.for %i = 0 to 10 {
+    // CHECK: affine.for
+    affine.for %j = 1 to 10 {
+       %0 = affine.load %mem[%i, %j - 1] : memref<10x10xf32>
+       affine.store %0, %mem[%i, %j] : memref<10x10xf32>
+    }
+    // CHECK: affine.for
+    affine.for %k = 0 to 10 {
+       %1 = affine.load %mem[%i, %k] : memref<10x10xf32>
+       affine.store %1, %mem[%i, %k] : memref<10x10xf32>
+    }
+  }
+  return
+}
+
+// CHECK-LABEL: func @mixed_parallel_and_seq_siblings
+func.func @mixed_parallel_and_seq_siblings(%mem: memref<10x10xf32>) {
+  /// Sibling top-level loops are analyzed independently. The first nest is
+  /// safe; the second has a dependency in a mapped loop (thread dim).
+
+  // CHECK: gpu.launch
+  affine.for %i = 0 to 10 {
+    affine.for %j = 0 to 10 {
+       %0 = affine.load %mem[%i, %j] : memref<10x10xf32>
+       affine.store %0, %mem[%i, %j] : memref<10x10xf32>
+    }
+  }
+
+  // CHECK-NOT: gpu.launch
+  // CHECK: affine.for
+  affine.for %i2 = 0 to 10 {
+    // CHECK: affine.for
+    affine.for %j2 = 1 to 10 {
+       %1 = affine.load %mem[%i2, %j2 - 1] : memref<10x10xf32>
+       affine.store %1, %mem[%i2, %j2] : memref<10x10xf32>
+    }
+  }
+  return
+}
diff --git a/mlir/test/Conversion/SCFToGPU/step_one.mlir b/mlir/test/Conversion/SCFToGPU/step_one.mlir
index be6fadfbd0ad3..3f8a1a847bc22 100644
--- a/mlir/test/Conversion/SCFToGPU/step_one.mlir
+++ b/mlir/test/Conversion/SCFToGPU/step_one.mlir
@@ -64,12 +64,12 @@ func.func @step_1(%A : memref<?x?x?x?xf32>, %B : memref<?x?x?x?xf32>) {
           // CHECK-22-NEXT:   %[[jj:.*]] = arith.addi %{{.*}}, %{{.*}} : index
 
           // Using remapped values instead of loop iterators.
-          // CHECK-11:        {{.*}} = memref.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
-          // CHECK-22:        {{.*}} = memref.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
-          %0 = memref.load %A[%i, %j, %ii, %jj] : memref<?x?x?x?xf32>
-          // CHECK-11-NEXT:   memref.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
-          // CHECK-22-NEXT:   memref.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
-          memref.store %0, %B[%i, %j, %ii, %jj] : memref<?x?x?x?xf32>
+          // CHECK-11:        {{.*}} = affine.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
+          // CHECK-22:        {{.*}} = affine.load %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
+          %0 = affine.load %A[%i, %j, %ii, %jj] : memref<?x?x?x?xf32>
+          // CHECK-11-NEXT:   affine.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
+          // CHECK-22-NEXT:   affine.store {{.*}}, %{{.*}}[%[[i]], %[[j]], %[[ii]], %[[jj]]] : memref<?x?x?x?xf32>
+          affine.store %0, %B[%i, %j, %ii, %jj] : memref<?x?x?x?xf32>
 
           // CHECK-11: gpu.terminator
           // CHECK-22: gpu.terminator
@@ -79,4 +79,3 @@ func.func @step_1(%A : memref<?x?x?x?xf32>, %B : memref<?x?x?x?xf32>) {
   }
   return
 }
-
diff --git a/mlir/test/Conversion/SCFToGPU/step_positive.mlir b/mlir/test/Conversion/SCFToGPU/step_positive.mlir
index 84e8454e56171..4da3a458a9152 100644
--- a/mlir/test/Conversion/SCFToGPU/step_positive.mlir
+++ b/mlir/test/Conversion/SCFToGPU/step_positive.mlir
@@ -18,10 +18,10 @@ func.func @step_var(%A : memref<?x?xf32>, %B : memref<?x?xf32>) {
       // CHECK-NEXT: %[[prod_j:.*]] = arith.muli %{{.*}}, %{{.*}} : index
       // CHECK-NEXT: %[[j:.*]] = arith.addi %{{.*}}, %[[prod_j]] : index
 
-      // CHECK:     {{.*}} = memref.load %{{.*}}[%[[i]], %[[j]]] : memref<?x?xf32>
-      %0 = memref.load %A[%i, %j] : memref<?x?xf32>
-      // CHECK:     memref.store {{.*}}, %{{.*}}[%[[i]], %[[j]]] : memref<?x?xf32>
-      memref.store %0, %B[%i, %j] : memref<?x?xf32>
+      // CHECK:     {{.*}} = affine.load %{{.*}}[%[[i]], %[[j]]] : memref<?x?xf32>
+      %0 = affine.load %A[%i, %j] : memref<?x?xf32>
+      // CHECK:     affine.store {{.*}}, %{{.*}}[%[[i]], %[[j]]] : memref<?x?xf32>
+      affine.store %0, %B[%i, %j] : memref<?x?xf32>
     }
   }
   return