[Mlir-commits] [mlir] [MLIR][LLVMIR] Fix inline byval alloca hoisting out of allocation scope (PR #185399)

Mon Mar 9 05:27:30 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-mlir-llvm

Author: Berke Ates (Berke-Ates)

<details>
<summary>Changes</summary>

This PR fixes a bug of the LLVM dialect's function inlining, which materializes allocas for byval arguments and moves them to entry block for optimization. This optimization should not cross allocation scopes, as it leads to data races in parallel regions. See the example below.

```mlir
// Optimize with mlir-opt --pass-pipeline="builtin.module(inline)" to trigger the inlining bug.

module {
    // runner calls kernel(thread_idx, byval_struct, output)
    // After inlining, the alloca for the byval struct gets placed in the outer
    // region (shared by all threads) instead of inside the scf.forall body.
    // This is a bug: each thread should have its own copy, but they all share
    // the same alloca.
    llvm.func hidden @runner(%arg0: !llvm.ptr {llvm.byval = !llvm.struct<(f32)>}, %arg1: !llvm.ptr) {
        %c0 = arith.constant 0 : index
        %c1024 = arith.constant 1024 : index
        %c1 = arith.constant 1 : index
        scf.forall (%i) = (%c0) to (%c1024) step (%c1) {
            %idx = arith.index_cast %i : index to i64
            llvm.call @kernel(%idx, %arg0, %arg1) : (i64, !llvm.ptr, !llvm.ptr) -> ()
        }
        llvm.return
    }
    // kernel uses the byval struct as writable thread-local scratch space:
    // 1. Writes thread_id into the local copy
    // 2. Reads it back and stores to output
    // After buggy inlining, all threads share the same alloca, so thread A's
    // store is overwritten by thread B before thread A reads it back.
    // Expected: output[i] == float(i). Actual: non-deterministic.
    llvm.func hidden @kernel(%arg0: i64, %arg1: !llvm.ptr {llvm.byval = !llvm.struct<(f32)>}, %arg2: !llvm.ptr) {
        %thread_id_f32 = llvm.sitofp %arg0 : i64 to f32
        // BUG: after inlining, %arg1 is a shared alloca, concurrent threads
        // overwrite each other between the store and load below.
        llvm.store %thread_id_f32, %arg1 : f32, !llvm.ptr
        %val = llvm.load %arg1 : !llvm.ptr -> f32
        %out_ptr = llvm.getelementptr inbounds %arg2[%arg0] : (!llvm.ptr, i64) -> !llvm.ptr, f32
        llvm.store %val, %out_ptr : f32, !llvm.ptr
        llvm.return
    }
}
```

---
Full diff: https://github.com/llvm/llvm-project/pull/185399.diff


2 Files Affected:

- (modified) mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp (+19-3) 
- (modified) mlir/test/Dialect/LLVMIR/inlining.mlir (+27) 


``````````diff

diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
index 0e43480e82926..baaae51ef5a21 100644
--- a/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/LLVMIR/Transforms/InlinerInterfaceImpl.cpp
@@ -603,10 +603,26 @@ static Value handleByValArgumentInit(OpBuilder &builder, Location loc,
   // Allocate the new value on the stack.
   Value allocaOp;
   {
-    // Since this is a static alloca, we can put it directly in the entry block,
-    // so they can be absorbed into the prologue/epilogue at code generation.
+    // Walk up from the call site to find the innermost AutomaticAllocationScope
+    // (e.g. an llvm.func or scf.forall). Placing the alloca at the entry block
+    // of that scope keeps it inside parallel regions rather than hoisting it
+    // out, while still landing at the function entry block for the common
+    // non-parallel case.
     OpBuilder::InsertionGuard insertionGuard(builder);
-    Block *entryBlock = &(*argument.getParentRegion()->begin());
+    Block *entryBlock = nullptr;
+    Block *cursor = builder.getInsertionBlock();
+    while (cursor) {
+      Operation *parentOp = cursor->getParentOp();
+      if (!parentOp)
+        break;
+      if (parentOp->hasTrait<OpTrait::AutomaticAllocationScope>()) {
+        entryBlock = &cursor->getParent()->front();
+        break;
+      }
+      cursor = parentOp->getBlock();
+    }
+    if (!entryBlock)
+      entryBlock = &(*argument.getParentRegion()->begin());
     builder.setInsertionPointToStart(entryBlock);
     Value one = LLVM::ConstantOp::create(builder, loc, builder.getI64Type(),
                                          builder.getI64IntegerAttr(1));
diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir
index 70ce7ca20986b..e84a4a45ca45b 100644
--- a/mlir/test/Dialect/LLVMIR/inlining.mlir
+++ b/mlir/test/Dialect/LLVMIR/inlining.mlir
@@ -570,6 +570,33 @@ llvm.func @test_byval_global() {
 
 // -----
 
+// Check that inlining does not hoist byval allocas out of automatic allocation
+// scopes, such as parallel forall regions. Each parallel iteration must have
+// its own private copy of the byval argument.
+
+llvm.func @byval_in_parallel(%ptr : !llvm.ptr { llvm.byval = f32 }) {
+  llvm.return
+}
+
+// CHECK-LABEL: llvm.func @test_byval_in_parallel_region
+// CHECK-SAME: %[[PTR:[a-zA-Z0-9_]+]]: !llvm.ptr
+llvm.func @test_byval_in_parallel_region(%ptr : !llvm.ptr) {
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %c1 = arith.constant 1 : index
+  // Verify the alloca is not hoisted out of the parallel region.
+  // CHECK-NOT: llvm.alloca
+  // CHECK: scf.forall
+  scf.forall (%i) = (%c0) to (%c4) step (%c1) {
+    // CHECK: %[[ALLOCA:.+]] = llvm.alloca %{{.+}} x f32
+    // CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[PTR]]
+    llvm.call @byval_in_parallel(%ptr) : (!llvm.ptr) -> ()
+  }
+  llvm.return
+}
+
+// -----
+
 llvm.func @ignored_attrs(%ptr : !llvm.ptr { llvm.inreg, llvm.nocapture, llvm.nofree, llvm.preallocated = i32, llvm.returned, llvm.alignstack = 32 : i64, llvm.writeonly, llvm.noundef, llvm.nonnull }, %x : i32 { llvm.zeroext }) -> (!llvm.ptr { llvm.noundef, llvm.inreg, llvm.nonnull }) {
   llvm.return %ptr : !llvm.ptr
 }

``````````

</details>


https://github.com/llvm/llvm-project/pull/185399