[flang-commits] [flang] [flang][OpenMP] Fix incorrect handling for local variable in OpenMP parallel workshare firstprivate(P) (PR #195616)
via flang-commits
flang-commits at lists.llvm.org
Mon May 4 01:59:01 PDT 2026
https://github.com/SunilKuravinakop created https://github.com/llvm/llvm-project/pull/195616
Changes to handle "!$omp parallel workshare firstprivate(P)" where P is an array. Handling the creation and initialization of the local copy properly.
This handles issue [195337](https://github.com/llvm/llvm-project/issues/195337) .
>From dfb65ee076ec5787675df968edc03cc0021b1892 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Mon, 4 May 2026 03:39:17 -0500
Subject: [PATCH] Changes to handle "!$omp parallel workshare firstprivate(P)"
where P is an array. Handling the creation and initialization of the local
copy properly.
---
flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 32 +++++++-
.../OpenMP/workshare-firstprivate-pointer.f90 | 64 +++++++++++++++
.../OpenMP/lower-workshare-thread-local.mlir | 80 +++++++++++++++++++
3 files changed, 175 insertions(+), 1 deletion(-)
create mode 100644 flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index a41d8d8826501..cf51cb887622f 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -156,6 +156,11 @@ static bool isOpenMPThreadLocalMemory(Operation *op, Value mem) {
fir::AliasAnalysis aliasAnalysis;
fir::AliasAnalysis::Source source = aliasAnalysis.getSource(mem);
+ // With firstprivate(P) where P is a pointer, each thread gets its own copy
+ // of the descriptor, but P(i) accesses shared target data.
+ if (source.accessPath.hasPointerDeref())
+ return false;
+
// Check if the source is a Value (not a global symbol).
mlir::Value sourceValue =
llvm::dyn_cast_if_present<mlir::Value>(source.origin.u);
@@ -370,10 +375,34 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
SmallVector<Value> copyPrivate;
bool allParallelized = true;
+ // "firstprivate" pointer initialization creates: (1) alloca, (2) store
+ // null box, (3) copy original. If step (2) is duplicated into the
+ // parallel block, it runs after initialization of the private copy and
+ // overwrites the pointer descriptor with null, causing a segfault on
+ // dereference.
+ SmallPtrSet<Value, 4> hoistedCopyprivateAllocas;
+
for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
if (isSafeToParallelize(&op)) {
singleBuilder.clone(op, singleMapping);
- if (llvm::all_of(op.getOperands(), [&](Value opr) {
+ // Check if this operation writes to a hoisted copyprivate alloca.
+ // Such stores must stay only in the single block; the copyprivate
+ // mechanism handles broadcasting the final value to all threads.
+ bool writesToCopyprivateAlloca = false;
+ if (!hoistedCopyprivateAllocas.empty()) {
+ if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(&op)) {
+ SmallVector<MemoryEffects::EffectInstance> effects;
+ memEffects.getEffects(effects);
+ writesToCopyprivateAlloca =
+ llvm::any_of(effects, [&](const auto &eff) {
+ return isa<MemoryEffects::Write>(eff.getEffect()) &&
+ eff.getValue() &&
+ hoistedCopyprivateAllocas.contains(eff.getValue());
+ });
+ }
+ }
+ if (!writesToCopyprivateAlloca &&
+ llvm::all_of(op.getOperands(), [&](Value opr) {
// Either we have already remapped it
bool remapped = rootMapping.contains(opr);
// Or it is available because it dominates `sr`
@@ -399,6 +428,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
rootMapping.map(&*alloca, &*hoisted);
rootMapping.map(alloca.getResult(), hoisted.getResult());
copyPrivate.push_back(hoisted);
+ hoistedCopyprivateAllocas.insert(alloca.getResult());
allParallelized = false;
} else {
singleBuilder.clone(op, singleMapping);
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
new file mode 100644
index 0000000000000..5e08c2dd161ec
--- /dev/null
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -0,0 +1,64 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+! added to this directory and sub-directories.
+!===----------------------------------------------------------------------===!
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix HLFIR
+!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix FIR
+!RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix LLVM
+
+! Test that parallel workshare with firstprivate(P) where P is a pointer
+! correctly places stores through the pointer target in omp.single rather
+! than parallelizing them. The pointer descriptor is thread-local (firstprivate),
+! but the target data is shared memory.
+
+subroutine test_workshare_firstprivate_pointer(P)
+ integer, pointer, intent(in) :: P(:)
+ integer :: i
+ !$omp parallel workshare firstprivate(P)
+ forall (i = 1:SIZE(P)) P(i) = i
+ !$omp end parallel workshare
+end subroutine
+
+! HLFIR: omp.parallel {
+! HLFIR: omp.workshare {
+! The firstprivate copy: alloca, zero-init, declare, then copy from original
+! HLFIR: fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! HLFIR: fir.store
+! HLFIR: hlfir.declare
+! HLFIR: fir.load
+! HLFIR: fir.store
+! HLFIR: hlfir.forall
+! HLFIR: omp.terminator
+! HLFIR: }
+! HLFIR: omp.terminator
+! HLFIR: }
+
+! After workshare lowering, the forall body (which stores through the pointer
+! target) must be inside omp.single, not parallelized.
+! FIR: omp.parallel {
+! FIR: %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! The firstprivate init + copy and the forall loop must be in omp.single
+! FIR: omp.single copyprivate(%[[DESC]]
+! FIR: fir.store
+! FIR: fir.declare
+! FIR: fir.load
+! FIR: fir.store
+! The forall loop accesses pointer target (shared memory) - must stay in single
+! FIR: fir.do_loop
+! FIR: fir.array_coor
+! FIR: fir.store
+! FIR: omp.terminator
+! FIR: }
+! FIR: omp.barrier
+! FIR: omp.terminator
+
+! At LLVM IR level, verify the OpenMP fork call exists and the loop body
+! is inside the outlined function.
+! LLVM: call void {{.*}}__kmpc_fork_call
+! LLVM: define internal void @test_workshare_firstprivate_pointer_..omp_par
+! The single construct must be present in the outlined function
+! LLVM: call i32 @__kmpc_single
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index d6000c989515b..12bae176b70d2 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -403,3 +403,83 @@ func.func @forall_pattern_in_workshare(%shared: !fir.ref<i32>) {
// CHECK: }
// CHECK: omp.barrier
// CHECK: }
+
+
+// Check that a store through a pointer dereference is NOT considered
+// thread-local, even if the pointer descriptor itself is in a thread-local
+// alloca. This models the "parallel workshare firstprivate(P)" case where P
+// is a Fortran POINTER: each thread gets its own copy of the descriptor, but
+// P(i) accesses shared target data through the pointer.
+
+// CHECK-LABEL: func.func @pointer_deref_not_thread_local
+func.func @pointer_deref_not_thread_local() {
+ omp.parallel {
+ // Thread-local alloca for the pointer descriptor (models firstprivate)
+ %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+ %decl = fir.declare %desc {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "p"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ omp.workshare {
+ // Load the pointer box and access the target data via array_coor.
+ // Even though %desc is thread-local, the target data is shared.
+ %box = fir.load %decl : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ %c0 = arith.constant 0 : index
+ %dims:3 = fir.box_dims %box, %c0 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+ %shift = fir.shift %dims#0 : (index) -> !fir.shift<1>
+ %c1_i64 = arith.constant 1 : i64
+ %elem = fir.array_coor %box(%shift) %c1_i64 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+ %c42 = arith.constant 42 : i32
+ // This store goes to shared target data (through pointer deref),
+ // so it MUST be in omp.single, not parallelized.
+ fir.store %c42 to %elem : !fir.ref<i32>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// The store through the pointer dereference must be inside omp.single.
+// CHECK: omp.parallel {
+// CHECK: fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK: omp.single
+// CHECK: fir.load
+// CHECK: fir.box_dims
+// CHECK: fir.array_coor
+// CHECK: fir.store
+// CHECK: omp.terminator
+// CHECK-NEXT: }
+// CHECK: omp.barrier
+// CHECK: }
+
+
+// Check that a direct store to the pointer descriptor alloca (not through
+// the pointer target) IS still recognized as thread-local.
+
+// CHECK-LABEL: func.func @pointer_descriptor_store_is_thread_local
+func.func @pointer_descriptor_store_is_thread_local() {
+ omp.parallel {
+ %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+ omp.workshare {
+ %null = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+ %c0 = arith.constant 0 : index
+ %shape = fir.shape %c0 : (index) -> !fir.shape<1>
+ %box = fir.embox %null(%shape) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+ // This store updates the descriptor itself (thread-local alloca),
+ // NOT the pointer target, so it should be parallelized.
+ fir.store %box to %desc : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// The store to the descriptor alloca is thread-local and should NOT be in omp.single.
+// CHECK: omp.parallel {
+// CHECK-NEXT: %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK: fir.zero_bits
+// CHECK: fir.shape
+// CHECK: fir.embox
+// CHECK: fir.store {{.*}} to %[[DESC]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK-NEXT: omp.barrier
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
More information about the flang-commits
mailing list