[flang-commits] [flang] [flang][OpenMP] Fix crash when a sliced array is specified in a forall within a workshare construct (PR #170913)
Carlos Seo via flang-commits
flang-commits at lists.llvm.org
Fri Feb 13 11:08:19 PST 2026
https://github.com/ceseo updated https://github.com/llvm/llvm-project/pull/170913
>From 44b74cb969c9bc821de28ce529503e08b8ba4d51 Mon Sep 17 00:00:00 2001
From: Carlos Seo <carlos.seo at linaro.org>
Date: Fri, 5 Dec 2025 19:55:22 +0000
Subject: [PATCH] [flang][OpenMP] Fix crash when a sliced array is specified in
forall construct within workshare construct
This is fix for two problems:
1. Thread-local variables sometimes are required to be paralellized. Added a
special case to handle this in LowerWorkshare.cpp:isSafeToParallelize.
A new helper function (isOpenMPThreadLocalMemory) was created to determine
if a memory reference is thread-local:
* Best-effort analysis to determine if code is inside a parallel region.
* Two possibilities for thread-local memory:
a. Memory allocated via fir::AllocaOp inside the enclosing omp.parallel
region;
b. Memory from OpenMP clause block arguments that create TLS (private,
firstprivate, lastprivate, reduction, linear).
2. Race condition caused by a `nowait` added to the omp.workshare if it is the
last operation in a block. This allowed multiple threads to execute the
omp.workshare region concurrently. Since _FortranAPushValue modifies a
shared stack, this concurrent access causes a crash. Disable the addition of
`nowait` and rely on the implicit barrier at the the of the omp.workshare
region.
Fixes #143330
---
flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 101 ++++++-
.../OpenMP/workshare-forall-sliced-array.f90 | 57 ++++
.../OpenMP/lower-workshare-thread-local.mlir | 265 ++++++++++++++++++
3 files changed, 421 insertions(+), 2 deletions(-)
create mode 100644 flang/test/Integration/OpenMP/workshare-forall-sliced-array.f90
create mode 100644 flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index f6af684f06edb..457b74d84a81f 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -16,6 +16,7 @@
//
//===----------------------------------------------------------------------===//
+#include <flang/Optimizer/Analysis/AliasAnalysis.h>
#include <flang/Optimizer/Builder/FIRBuilder.h>
#include <flang/Optimizer/Dialect/FIROps.h>
#include <flang/Optimizer/Dialect/FIRType.h>
@@ -37,6 +38,7 @@
#include <mlir/IR/PatternMatch.h>
#include <mlir/IR/Value.h>
#include <mlir/IR/Visitors.h>
+#include <mlir/Interfaces/LoopLikeInterface.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include <mlir/Support/LLVM.h>
@@ -135,9 +137,97 @@ static bool mustParallelizeOp(Operation *op) {
.wasInterrupted();
}
+// Determines if a memory reference is thread-local in an OpenMP context.
+//
+// This is a best-effort analysis. We cannot definitively determine if code
+// is inside a parallel region when it's in a function called from that
+// region. However, we can identify common patterns of thread-local memory:
+//
+// 1. Memory allocated via fir.alloca inside the enclosing omp.parallel region
+// 2. Memory that comes from OpenMP clause block arguments that create
+// thread-local storage (private, firstprivate, lastprivate, reduction,
+// linear clauses)
+//
+// Returns true if the memory reference appears to be thread-local and thus
+// safe to parallelize (each thread should access its own copy).
+static bool isOpenMPThreadLocalMemory(Operation *op, Value mem) {
+ // Use AliasAnalysis to trace through declares, converts, reboxes, etc.
+ // to find the underlying source of the memory reference.
+ fir::AliasAnalysis aliasAnalysis;
+ fir::AliasAnalysis::Source source = aliasAnalysis.getSource(mem);
+
+ // Check if the source is a Value (not a global symbol).
+ mlir::Value sourceValue =
+ llvm::dyn_cast_if_present<mlir::Value>(source.origin.u);
+ if (!sourceValue)
+ return false;
+
+ // Case 1: Memory allocated by fir.alloca inside the enclosing parallel
+ // region is thread-private (each thread gets its own stack allocation).
+ if (source.kind == fir::AliasAnalysis::SourceKind::Allocate) {
+ if (auto alloca = sourceValue.getDefiningOp<fir::AllocaOp>()) {
+ if (auto parallelOp = alloca->getParentOfType<omp::ParallelOp>()) {
+ if (op->getParentOfType<omp::ParallelOp>() == parallelOp)
+ return true;
+ }
+ }
+ }
+
+ // Case 2: Memory from OpenMP clause block arguments that create thread-local
+ // storage. These clauses create private copies for each thread:
+ // - private: uninitialized thread-local copy
+ // - firstprivate: thread-local copy initialized from original
+ // - lastprivate: thread-local copy, value copied back after construct
+ // - reduction: thread-local copy for reduction operations
+ // - linear: thread-local copy with linear modification
+ //
+ // Check if the source value is a block argument of an OpenMP operation
+ // that implements BlockArgOpenMPOpInterface.
+ if (auto blockArg = llvm::dyn_cast<BlockArgument>(sourceValue)) {
+ Operation *parentOp = blockArg.getOwner()->getParentOp();
+ if (auto argIface =
+ llvm::dyn_cast<omp::BlockArgOpenMPOpInterface>(parentOp)) {
+ // Check if this block argument corresponds to a privatizing clause.
+ // Private, reduction, and in_reduction clauses create thread-local
+ // memory.
+ auto isInBlockArgs = [&](auto blockArgs) {
+ return llvm::is_contained(blockArgs, blockArg);
+ };
+
+ if (isInBlockArgs(argIface.getPrivateBlockArgs()))
+ return true;
+ if (isInBlockArgs(argIface.getReductionBlockArgs()))
+ return true;
+ if (isInBlockArgs(argIface.getInReductionBlockArgs()))
+ return true;
+ if (isInBlockArgs(argIface.getTaskReductionBlockArgs()))
+ return true;
+ }
+ }
+
+ return false;
+}
+
static bool isSafeToParallelize(Operation *op) {
- return isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
- isMemoryEffectFree(op);
+ if (isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
+ isMemoryEffectFree(op))
+ return true;
+
+ // Thread-local variables allocated in the OpenMP parallel region or coming
+ // from privatizing clauses are private to each thread and thus safe (and
+ // sometimes required) to parallelize. If the compiler wraps such load/stores
+ // in an omp.single block, only one thread updates its local copy, while
+ // all other threads read uninitialized data (see issue #143330).
+ Value mem;
+ if (auto store = dyn_cast<fir::StoreOp>(op))
+ mem = store.getMemref();
+ else if (auto load = dyn_cast<fir::LoadOp>(op))
+ mem = load.getMemref();
+
+ if (mem && isOpenMPThreadLocalMemory(op, mem))
+ return true;
+
+ return false;
}
/// Simple shallow copies suffice for our purposes in this pass, so we implement
@@ -335,6 +425,13 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
for (auto [i, opOrSingle] : llvm::enumerate(regions)) {
bool isLast = i + 1 == regions.size();
+ // Make sure shared runtime calls are synchronized: disable `nowait`
+ // insertion, and rely on the implicit barrier at the end of the
+ // omp.workshare block. This applies to any loop-like operation
+ // (fir.do_loop, fir.iterate_while, fir.do_concurrent.loop, etc.)
+ // because iterations could overlap if nowait is used.
+ if (isa<LoopLikeOpInterface>(block.getParentOp()))
+ isLast = false;
if (std::holds_alternative<SingleRegion>(opOrSingle)) {
OpBuilder singleBuilder(sourceRegion.getContext());
Block *singleBlock = new Block();
diff --git a/flang/test/Integration/OpenMP/workshare-forall-sliced-array.f90 b/flang/test/Integration/OpenMP/workshare-forall-sliced-array.f90
new file mode 100644
index 0000000000000..88d1062b091bf
--- /dev/null
+++ b/flang/test/Integration/OpenMP/workshare-forall-sliced-array.f90
@@ -0,0 +1,57 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+! added to this directory and sub-directories.
+!===----------------------------------------------------------------------===!
+
+! Regression test for https://github.com/llvm/llvm-project/issues/143330
+! Verify that forall constructs with sliced arrays inside workshare are
+! correctly lowered without causing race conditions or crashes.
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s --check-prefix HLFIR
+!RUN: %flang_fc1 -emit-fir -fopenmp %s -o - | FileCheck %s --check-prefix FIR
+!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s --check-prefix LLVM
+
+subroutine workshare_forall_sliced(a1)
+ integer :: a1(2,1,3)
+ !$omp parallel
+ !$omp workshare
+ forall (n1=1:1)
+ forall (n2=1:3)
+ a1(:,n1,n2) = a1(:,n1,n2)+1
+ end forall
+ end forall
+ !$omp end workshare
+ !$omp end parallel
+end subroutine
+
+! HLFIR-LABEL: func.func @_QPworkshare_forall_sliced
+! HLFIR: omp.parallel {
+! HLFIR: omp.workshare {
+! HLFIR: hlfir.forall
+! HLFIR: hlfir.forall
+! HLFIR: hlfir.region_assign
+! HLFIR: omp.terminator
+! HLFIR: }
+! HLFIR: omp.terminator
+! HLFIR: }
+
+! After workshare lowering, the forall should be in omp.single (since it
+! contains operations that are not safe to parallelize across threads).
+! The key fix is that this compiles without crashing and the runtime
+! executes correctly.
+
+! FIR-LABEL: func.func @_QPworkshare_forall_sliced
+! FIR: omp.parallel {
+! FIR: omp.single
+! FIR: fir.do_loop
+! FIR: fir.do_loop
+! FIR: omp.barrier
+! FIR: omp.terminator
+! FIR: }
+
+! Verify LLVM IR is generated successfully (the original issue caused crashes)
+! LLVM-LABEL: define {{.*}}workshare_forall_sliced
+! LLVM: call {{.*}}@__kmpc_fork_call
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
new file mode 100644
index 0000000000000..f6e848da4e3f4
--- /dev/null
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -0,0 +1,265 @@
+// RUN: fir-opt --split-input-file --lower-workshare --allow-unregistered-dialect %s | FileCheck %s
+
+// Tests for thread-local memory handling in workshare lowering (#143330):
+// 1. Thread-local variables (from fir.alloca in omp.parallel or from OpenMP
+// private/reduction clauses) should be parallelized, not wrapped in omp.single
+// 2. nowait should not be added to omp.single when inside loop-like operations
+// that contain omp.workshare.loop_wrapper
+
+
+// Check that fir.alloca inside omp.parallel creates thread-local memory,
+// and stores to it should be parallelized (not wrapped in omp.single).
+
+// CHECK-LABEL: func.func @thread_local_alloca_store
+func.func @thread_local_alloca_store() {
+ omp.parallel {
+ // The alloca is inside omp.parallel, so it's thread-local
+ %alloca = fir.alloca i32
+ omp.workshare {
+ %c1 = arith.constant 1 : i32
+ // This store should NOT be in omp.single because %alloca is thread-local
+ fir.store %c1 to %alloca : !fir.ref<i32>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// CHECK: omp.parallel {
+// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
+// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
+// CHECK-NEXT: fir.store %[[C1]] to %[[ALLOCA]] : !fir.ref<i32>
+// CHECK-NEXT: omp.barrier
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
+
+
+// Check that memory accessed through fir.declare is also recognized as thread-local
+// when the underlying alloca is in the parallel region.
+
+// CHECK-LABEL: func.func @thread_local_with_declare
+func.func @thread_local_with_declare() {
+ omp.parallel {
+ %alloca = fir.alloca i32
+ %declare = fir.declare %alloca {uniq_name = "local_var"} : (!fir.ref<i32>) -> !fir.ref<i32>
+ omp.workshare {
+ %c42 = arith.constant 42 : i32
+ // Store through declare should still be recognized as thread-local
+ fir.store %c42 to %declare : !fir.ref<i32>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// CHECK: omp.parallel {
+// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
+// CHECK-NEXT: %[[DECLARE:.*]] = fir.declare %[[ALLOCA]]
+// CHECK-NEXT: %[[C42:.*]] = arith.constant 42 : i32
+// CHECK-NEXT: fir.store %[[C42]] to %[[DECLARE]] : !fir.ref<i32>
+// CHECK-NEXT: omp.barrier
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
+
+
+// Check that private clause block arguments are recognized as thread-local.
+
+omp.private {type = private} @x_private : i32
+
+// CHECK-LABEL: func.func @private_clause_thread_local
+func.func @private_clause_thread_local(%arg0: !fir.ref<i32>) {
+ omp.parallel private(@x_private %arg0 -> %priv_arg : !fir.ref<i32>) {
+ omp.workshare {
+ %c10 = arith.constant 10 : i32
+ // Store to private variable should NOT be in omp.single
+ fir.store %c10 to %priv_arg : !fir.ref<i32>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// CHECK: omp.parallel private(@x_private %{{.*}} -> %[[PRIV_ARG:.*]] : !fir.ref<i32>) {
+// CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : i32
+// CHECK-NEXT: fir.store %[[C10]] to %[[PRIV_ARG]] : !fir.ref<i32>
+// CHECK-NEXT: omp.barrier
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
+
+
+// Check that reduction clause block arguments are recognized as thread-local.
+
+omp.declare_reduction @add_reduction_i32 : i32 init {
+^bb0(%arg0: i32):
+ %c0 = arith.constant 0 : i32
+ omp.yield(%c0 : i32)
+} combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %0 = arith.addi %arg0, %arg1 : i32
+ omp.yield(%0 : i32)
+}
+
+// CHECK-LABEL: func.func @reduction_clause_thread_local
+func.func @reduction_clause_thread_local(%arg0: !fir.ref<i32>) {
+ omp.parallel reduction(@add_reduction_i32 %arg0 -> %red_arg : !fir.ref<i32>) {
+ omp.workshare {
+ %c5 = arith.constant 5 : i32
+ // Store to reduction variable should NOT be in omp.single
+ fir.store %c5 to %red_arg : !fir.ref<i32>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// CHECK: omp.parallel reduction(@add_reduction_i32 %{{.*}} -> %[[RED_ARG:.*]] : !fir.ref<i32>) {
+// CHECK-NEXT: %[[C5:.*]] = arith.constant 5 : i32
+// CHECK-NEXT: fir.store %[[C5]] to %[[RED_ARG]] : !fir.ref<i32>
+// CHECK-NEXT: omp.barrier
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
+
+
+// Check that nowait is NOT added to omp.single when inside fir.do_loop
+// that contains omp.workshare.loop_wrapper. This prevents race conditions
+// when multiple threads execute different loop iterations concurrently.
+// The workshare.loop_wrapper triggers recursive parallelization of the loop body.
+
+// CHECK-LABEL: func.func @no_nowait_in_loop_with_workshare_wrapper
+func.func @no_nowait_in_loop_with_workshare_wrapper(%arg0: !fir.ref<i32>) {
+ omp.parallel {
+ omp.workshare {
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+ fir.do_loop %i = %c1 to %c10 step %c1 {
+ // This side-effecting op will be wrapped in omp.single without nowait
+ "test.side_effect"(%arg0) : (!fir.ref<i32>) -> ()
+ // The workshare.loop_wrapper triggers recursive processing of the loop
+ omp.workshare.loop_wrapper {
+ omp.loop_nest (%j) : index = (%c1) to (%c10) inclusive step (%c1) {
+ "test.inner"() : () -> ()
+ omp.yield
+ }
+ }
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// The omp.single inside the loop should NOT have nowait
+// CHECK: omp.parallel {
+// CHECK: fir.do_loop
+// CHECK: omp.single {
+// CHECK: "test.side_effect"
+// CHECK: omp.terminator
+// CHECK-NEXT: }
+// CHECK: omp.wsloop {
+// CHECK: }
+// CHECK: omp.barrier
+// CHECK: }
+
+
+// Check that thread-local store inside a loop with workshare.loop_wrapper
+// is correctly parallelized (not wrapped in omp.single).
+
+// CHECK-LABEL: func.func @thread_local_store_in_loop_with_wrapper
+func.func @thread_local_store_in_loop_with_wrapper() {
+ omp.parallel {
+ %alloca = fir.alloca i32
+ omp.workshare {
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+ fir.do_loop %i = %c1 to %c10 step %c1 {
+ %c99 = arith.constant 99 : i32
+ // Store to thread-local alloca should NOT be in omp.single
+ fir.store %c99 to %alloca : !fir.ref<i32>
+ omp.workshare.loop_wrapper {
+ omp.loop_nest (%j) : index = (%c1) to (%c10) inclusive step (%c1) {
+ "test.inner"() : () -> ()
+ omp.yield
+ }
+ }
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// CHECK: omp.parallel {
+// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
+// CHECK: fir.do_loop
+// The store should be outside omp.single
+// CHECK: %[[C99:.*]] = arith.constant 99 : i32
+// CHECK-NEXT: fir.store %[[C99]] to %[[ALLOCA]] : !fir.ref<i32>
+// CHECK: omp.wsloop {
+// CHECK: }
+// CHECK: omp.barrier
+// CHECK: }
+
+
+// Check that non-thread-local memory is still wrapped in omp.single.
+// This is the baseline case to ensure we haven't broken normal behavior.
+
+// CHECK-LABEL: func.func @non_thread_local_needs_single
+func.func @non_thread_local_needs_single(%arg0: !fir.ref<i32>) {
+ omp.parallel {
+ omp.workshare {
+ %c1 = arith.constant 1 : i32
+ // arg0 is shared memory, store must be in omp.single
+ fir.store %c1 to %arg0 : !fir.ref<i32>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// CHECK: omp.parallel {
+// CHECK-NEXT: omp.single nowait {
+// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
+// CHECK-NEXT: fir.store %[[C1]] to %{{.*}} : !fir.ref<i32>
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
+// CHECK-NEXT: omp.barrier
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
+
+
+// Check that loads from thread-local alloca combined with stores are parallelized.
+
+// CHECK-LABEL: func.func @thread_local_load_and_store
+func.func @thread_local_load_and_store() {
+ omp.parallel {
+ %alloca = fir.alloca i32
+ omp.workshare {
+ %c1 = arith.constant 1 : i32
+ fir.store %c1 to %alloca : !fir.ref<i32>
+ %val = fir.load %alloca : !fir.ref<i32>
+ fir.store %val to %alloca : !fir.ref<i32>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// All operations on thread-local memory should be outside omp.single
+
+// CHECK: omp.parallel {
+// CHECK-NEXT: %[[ALLOCA:.*]] = fir.alloca i32
+// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : i32
+// CHECK-NEXT: fir.store %[[C1]] to %[[ALLOCA]] : !fir.ref<i32>
+// CHECK-NEXT: %[[VAL:.*]] = fir.load %[[ALLOCA]] : !fir.ref<i32>
+// CHECK-NEXT: fir.store %[[VAL]] to %[[ALLOCA]] : !fir.ref<i32>
+// CHECK-NEXT: omp.barrier
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
More information about the flang-commits
mailing list