[flang-commits] [flang] db5cd62 - [flang][OpenMP] Restrict isSafeToParallelize to write-only thread-local effects (#188595)

Fri Mar 27 08:11:32 PDT 2026

Author: Carlos Seo
Date: 2026-03-27T12:11:27-03:00
New Revision: db5cd626b97a887cd237f9be9372cd3750fc7a02

URL: https://github.com/llvm/llvm-project/commit/db5cd626b97a887cd237f9be9372cd3750fc7a02
DIFF: https://github.com/llvm/llvm-project/commit/db5cd626b97a887cd237f9be9372cd3750fc7a02.diff

LOG: [flang][OpenMP] Restrict isSafeToParallelize to write-only thread-local effects (#188595)

This is a follow-up fix for commit 0f5e9bee.

Only write effects to thread-local memory should be considered safe to
parallelize in workshare lowering, not reads. When both reads and writes
were safe, the cascading effect in moveToSingle could cause entire
SingleRegions to become fully parallelized, eliminating the omp.single
and its implicit barrier. This removed synchronization points needed to
keep threads coordinated inside sequential loops containing workshared
operations, causing race conditions in forall-workshare patterns.

This was exposed by the Fujitsu Test Suite and made the following tests
regress:

FAIL: test-suite :: Fujitsu/Fortran/0398/Fujitsu-Fortran-0398_0031.test
FAIL: test-suite :: Fujitsu/Fortran/0398/Fujitsu-Fortran-0398_0013.test
FAIL: test-suite :: Fujitsu/Fortran/0398/Fujitsu-Fortran-0398_0030.test
FAIL: test-suite :: Fujitsu/Fortran/0398/Fujitsu-Fortran-0398_0014.test

Updates #143330

Added: 
    

Modified: 
    flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
    flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir

Removed: 
    


################################################################################
diff  --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index ee42801da09d9..a41d8d8826501 100644

--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -236,19 +236,25 @@ static bool isSafeToParallelize(Operation *op) {
 
   // Thread-local variables allocated in the OpenMP parallel region or coming
   // from privatizing clauses are private to each thread and thus safe (and
-  // sometimes required) to parallelize. If the compiler wraps such load/stores
-  // in an omp.single block, only one thread updates its local copy, while
-  // all other threads read uninitialized data (see issue #143330).
-  // Use MemoryEffectOpInterface to check all memory effects generically.
-  // This also handles hlfir.assign which is present when the pass runs before
-  // HLFIR lowering.
+  // sometimes required) to parallelize. If the compiler wraps stores to
+  // thread-local variables in an omp.single block, only one thread updates
+  // its local copy, while all other threads read uninitialized data (see
+  // issue #143330).
+  //
+  // Only WRITE effects to thread-local memory are considered safe here, not
+  // reads. If reads were also safe, the cascading effect in moveToSingle
+  // could cause entire SingleRegions to become fully parallelized (all ops
+  // safe), eliminating the omp.single and its implicit barrier. This removes
+  // synchronization points needed to keep threads coordinated inside
+  // sequential loops that contain workshared operations.
   if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(op)) {
     SmallVector<MemoryEffects::EffectInstance> effects;
     memEffects.getEffects(effects);
     if (!effects.empty() &&
         llvm::all_of(effects, [&](const MemoryEffects::EffectInstance &effect) {
           Value val = effect.getValue();
-          return val && isOpenMPThreadLocalMemory(op, val);
+          return val && isa<MemoryEffects::Write>(effect.getEffect()) &&
+                 isOpenMPThreadLocalMemory(op, val);
         }))
       return true;
   }

diff  --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index d046ef4cc46f7..d6000c989515b 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -119,6 +119,39 @@ func.func @hlfir_assign_private_clause(%arg0: !fir.ref<i32>) {
 // CHECK-NEXT:  }
 
 
+// -----
+
+// Check that hlfir.assign from a shared variable to a private variable is
+// NOT parallelized. The assign has a Read effect on the shared RHS, which
+// is not a write to thread-local memory, so isSafeToParallelize returns false.
+
+omp.private {type = private} @y_private : i32
+
+// CHECK-LABEL: func.func @hlfir_assign_shared_to_private
+func.func @hlfir_assign_shared_to_private(%arg0: !fir.ref<i32>, %shared: !fir.ref<i32>) {
+  omp.parallel private(@y_private %arg0 -> %priv_arg : !fir.ref<i32>) {
+    %decl:2 = hlfir.declare %priv_arg {uniq_name = "x"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    omp.workshare {
+      // hlfir.assign with a shared RHS variable should stay in omp.single
+      hlfir.assign %shared to %decl#0 : !fir.ref<i32>, !fir.ref<i32>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK:       omp.parallel private(@y_private %{{.*}} -> %[[PRIV_ARG:.*]] : !fir.ref<i32>) {
+// CHECK-NEXT:    %[[DECL:.*]]:2 = hlfir.declare %[[PRIV_ARG]]
+// CHECK-NEXT:    omp.single nowait {
+// CHECK:           hlfir.assign %{{.*}} to %[[DECL]]#0 : !fir.ref<i32>, !fir.ref<i32>
+// CHECK:           omp.terminator
+// CHECK-NEXT:    }
+// CHECK-NEXT:    omp.barrier
+// CHECK-NEXT:    omp.terminator
+// CHECK-NEXT:  }
+
+
 // Check that reduction clause block arguments are recognized as thread-local.
 
 omp.declare_reduction @add_reduction_i32 : i32 init {
@@ -263,7 +296,9 @@ func.func @non_thread_local_needs_single(%arg0: !fir.ref<i32>) {
 // CHECK-NEXT:  }
 
 
-// Check that loads from thread-local alloca combined with stores are parallelized.
+// Check that stores to thread-local alloca are parallelized, but loads stay in
+// omp.single. Only writes to thread-local memory are safe to parallelize;
+// reads must remain in the single to preserve synchronization barriers.
 
 // CHECK-LABEL: func.func @thread_local_load_and_store
 func.func @thread_local_load_and_store() {
@@ -281,14 +316,90 @@ func.func @thread_local_load_and_store() {
   return
 }
 
-// All operations on thread-local memory should be outside omp.single
+// The store to thread-local memory is parallelized (outside the single),
+// but the load remains inside the single to maintain synchronization.
 
 // CHECK:       omp.parallel {
 // CHECK-NEXT:    %[[ALLOCA:.*]] = fir.alloca i32
-// CHECK-NEXT:    %[[C1:.*]] = arith.constant 1 : i32
-// CHECK-NEXT:    fir.store %[[C1]] to %[[ALLOCA]] : !fir.ref<i32>
-// CHECK-NEXT:    %[[VAL:.*]] = fir.load %[[ALLOCA]] : !fir.ref<i32>
-// CHECK-NEXT:    fir.store %[[VAL]] to %[[ALLOCA]] : !fir.ref<i32>
+// CHECK:         omp.single nowait {
+// CHECK:           fir.store {{.*}} to %[[ALLOCA]] : !fir.ref<i32>
+// CHECK:           fir.load %[[ALLOCA]] : !fir.ref<i32>
+// CHECK:           fir.store {{.*}} to %[[ALLOCA]] : !fir.ref<i32>
+// CHECK:           omp.terminator
+// CHECK-NEXT:    }
+// CHECK:         fir.store {{.*}} to %[[ALLOCA]] : !fir.ref<i32>
 // CHECK-NEXT:    omp.barrier
 // CHECK-NEXT:    omp.terminator
 // CHECK-NEXT:  }
+
+
+// Check the forall-in-workshare pattern: a sequential loop (fir.do_loop) with
+// thread-local index stores, shared memory loads, and a workshare.loop_wrapper.
+// This models the lowered IR for:
+//   !$omp workshare
+//   forall (i=1:n) a(:,i) = a(:,i) + 1
+//   !$omp end workshare
+//
+// Loads from thread-local allocas must remain inside omp.single so that the
+// single's barrier preserves thread synchronization.
+// If reads were also safe-to-parallelize, the entire SingleRegion before the
+// workshare.loop_wrapper could become fully parallelized, eliminating the
+// omp.single and its implicit barrier. This caused race conditions on shared
+// runtime data structures in forall-workshare patterns (see issue #143330).
+
+// CHECK-LABEL: func.func @forall_pattern_in_workshare
+func.func @forall_pattern_in_workshare(%shared: !fir.ref<i32>) {
+  omp.parallel {
+    %idx_alloca = fir.alloca i32 {bindc_name = "i", pinned}
+    omp.workshare {
+      %c1 = arith.constant 1 : index
+      %c10 = arith.constant 10 : index
+      fir.do_loop %iv = %c1 to %c10 step %c1 {
+        // Store loop index to thread-local alloca (must be parallelized)
+        %iv_i32 = fir.convert %iv : (index) -> i32
+        fir.store %iv_i32 to %idx_alloca : !fir.ref<i32>
+        // Load from shared memory (must stay in omp.single)
+        %shared_val = fir.load %shared : !fir.ref<i32>
+        // Load from thread-local alloca (must stay in omp.single to
+        // prevent SingleRegion elimination and barrier loss)
+        %idx_val = fir.load %idx_alloca : !fir.ref<i32>
+        // Side-effecting op using both values (must stay in omp.single)
+        "test.side_effect"(%shared_val, %idx_val) : (i32, i32) -> ()
+        // Workshared loop
+        omp.workshare.loop_wrapper {
+          omp.loop_nest (%j) : index = (%c1) to (%c10) inclusive step (%c1) {
+            "test.inner"(%j) : (index) -> ()
+            omp.yield
+          }
+        }
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// The thread-local load must remain inside omp.single along with the shared
+// load and side-effecting op (preserving the single's barrier). The
+// thread-local store is also cloned inside the single, but a parallel copy
+// is placed after it so all threads update their own alloca.
+// CHECK:       omp.parallel {
+// CHECK:         %[[IDX:.*]] = fir.alloca i32 {bindc_name = "i", pinned}
+// CHECK:         fir.do_loop
+// The single contains the shared load, thread-local load, and side effect.
+// The thread-local store is also cloned inside (harmless, one thread runs it).
+// CHECK:           omp.single {
+// CHECK:             fir.store {{.*}} to %[[IDX]] : !fir.ref<i32>
+// CHECK:             fir.load %{{.*}} : !fir.ref<i32>
+// CHECK:             fir.load %[[IDX]] : !fir.ref<i32>
+// CHECK:             "test.side_effect"
+// CHECK:             omp.terminator
+// CHECK-NEXT:      }
+// The parallelized copy of the store runs after the single's barrier,
+// ensuring all threads update their own thread-local index alloca.
+// CHECK:           fir.store {{.*}} to %[[IDX]] : !fir.ref<i32>
+// CHECK:           omp.wsloop {
+// CHECK:         }
+// CHECK:         omp.barrier
+// CHECK:       }