[flang-commits] [flang] [flang][OpenMP] Fix incorrect handling for local variable in OpenMP parallel workshare firstprivate(P) (PR #195616)
via flang-commits
flang-commits at lists.llvm.org
Tue May 5 01:58:17 PDT 2026
https://github.com/SunilKuravinakop updated https://github.com/llvm/llvm-project/pull/195616
>From dfb65ee076ec5787675df968edc03cc0021b1892 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Mon, 4 May 2026 03:39:17 -0500
Subject: [PATCH 1/3] Changes to handle "!$omp parallel workshare
firstprivate(P)" where P is an array. Handling the creation and
initialization of the local copy properly.
---
flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 32 +++++++-
.../OpenMP/workshare-firstprivate-pointer.f90 | 64 +++++++++++++++
.../OpenMP/lower-workshare-thread-local.mlir | 80 +++++++++++++++++++
3 files changed, 175 insertions(+), 1 deletion(-)
create mode 100644 flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index a41d8d8826501..cf51cb887622f 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -156,6 +156,11 @@ static bool isOpenMPThreadLocalMemory(Operation *op, Value mem) {
fir::AliasAnalysis aliasAnalysis;
fir::AliasAnalysis::Source source = aliasAnalysis.getSource(mem);
+ // With firstprivate(P) where P is a pointer, each thread gets its own copy
+ // of the descriptor, but P(i) accesses shared target data.
+ if (source.accessPath.hasPointerDeref())
+ return false;
+
// Check if the source is a Value (not a global symbol).
mlir::Value sourceValue =
llvm::dyn_cast_if_present<mlir::Value>(source.origin.u);
@@ -370,10 +375,34 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
SmallVector<Value> copyPrivate;
bool allParallelized = true;
+ // "firstprivate" pointer initialization creates: (1) alloca, (2) store
+ // null box, (3) copy original. If step (2) is duplicated into the
+ // parallel block, it runs after initialization of the private copy and
+ // overwrites the pointer descriptor with null, causing a segfault on
+ // dereference.
+ SmallPtrSet<Value, 4> hoistedCopyprivateAllocas;
+
for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
if (isSafeToParallelize(&op)) {
singleBuilder.clone(op, singleMapping);
- if (llvm::all_of(op.getOperands(), [&](Value opr) {
+ // Check if this operation writes to a hoisted copyprivate alloca.
+ // Such stores must stay only in the single block; the copyprivate
+ // mechanism handles broadcasting the final value to all threads.
+ bool writesToCopyprivateAlloca = false;
+ if (!hoistedCopyprivateAllocas.empty()) {
+ if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(&op)) {
+ SmallVector<MemoryEffects::EffectInstance> effects;
+ memEffects.getEffects(effects);
+ writesToCopyprivateAlloca =
+ llvm::any_of(effects, [&](const auto &eff) {
+ return isa<MemoryEffects::Write>(eff.getEffect()) &&
+ eff.getValue() &&
+ hoistedCopyprivateAllocas.contains(eff.getValue());
+ });
+ }
+ }
+ if (!writesToCopyprivateAlloca &&
+ llvm::all_of(op.getOperands(), [&](Value opr) {
// Either we have already remapped it
bool remapped = rootMapping.contains(opr);
// Or it is available because it dominates `sr`
@@ -399,6 +428,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
rootMapping.map(&*alloca, &*hoisted);
rootMapping.map(alloca.getResult(), hoisted.getResult());
copyPrivate.push_back(hoisted);
+ hoistedCopyprivateAllocas.insert(alloca.getResult());
allParallelized = false;
} else {
singleBuilder.clone(op, singleMapping);
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
new file mode 100644
index 0000000000000..5e08c2dd161ec
--- /dev/null
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -0,0 +1,64 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+! added to this directory and sub-directories.
+!===----------------------------------------------------------------------===!
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix HLFIR
+!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix FIR
+!RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix LLVM
+
+! Test that parallel workshare with firstprivate(P) where P is a pointer
+! correctly places stores through the pointer target in omp.single rather
+! than parallelizing them. The pointer descriptor is thread-local (firstprivate),
+! but the target data is shared memory.
+
+subroutine test_workshare_firstprivate_pointer(P)
+ integer, pointer, intent(in) :: P(:)
+ integer :: i
+ !$omp parallel workshare firstprivate(P)
+ forall (i = 1:SIZE(P)) P(i) = i
+ !$omp end parallel workshare
+end subroutine
+
+! HLFIR: omp.parallel {
+! HLFIR: omp.workshare {
+! The firstprivate copy: alloca, zero-init, declare, then copy from original
+! HLFIR: fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! HLFIR: fir.store
+! HLFIR: hlfir.declare
+! HLFIR: fir.load
+! HLFIR: fir.store
+! HLFIR: hlfir.forall
+! HLFIR: omp.terminator
+! HLFIR: }
+! HLFIR: omp.terminator
+! HLFIR: }
+
+! After workshare lowering, the forall body (which stores through the pointer
+! target) must be inside omp.single, not parallelized.
+! FIR: omp.parallel {
+! FIR: %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! The firstprivate init + copy and the forall loop must be in omp.single
+! FIR: omp.single copyprivate(%[[DESC]]
+! FIR: fir.store
+! FIR: fir.declare
+! FIR: fir.load
+! FIR: fir.store
+! The forall loop accesses pointer target (shared memory) - must stay in single
+! FIR: fir.do_loop
+! FIR: fir.array_coor
+! FIR: fir.store
+! FIR: omp.terminator
+! FIR: }
+! FIR: omp.barrier
+! FIR: omp.terminator
+
+! At LLVM IR level, verify the OpenMP fork call exists and the loop body
+! is inside the outlined function.
+! LLVM: call void {{.*}}__kmpc_fork_call
+! LLVM: define internal void @test_workshare_firstprivate_pointer_..omp_par
+! The single construct must be present in the outlined function
+! LLVM: call i32 @__kmpc_single
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index d6000c989515b..12bae176b70d2 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -403,3 +403,83 @@ func.func @forall_pattern_in_workshare(%shared: !fir.ref<i32>) {
// CHECK: }
// CHECK: omp.barrier
// CHECK: }
+
+
+// Check that a store through a pointer dereference is NOT considered
+// thread-local, even if the pointer descriptor itself is in a thread-local
+// alloca. This models the "parallel workshare firstprivate(P)" case where P
+// is a Fortran POINTER: each thread gets its own copy of the descriptor, but
+// P(i) accesses shared target data through the pointer.
+
+// CHECK-LABEL: func.func @pointer_deref_not_thread_local
+func.func @pointer_deref_not_thread_local() {
+ omp.parallel {
+ // Thread-local alloca for the pointer descriptor (models firstprivate)
+ %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+ %decl = fir.declare %desc {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "p"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ omp.workshare {
+ // Load the pointer box and access the target data via array_coor.
+ // Even though %desc is thread-local, the target data is shared.
+ %box = fir.load %decl : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ %c0 = arith.constant 0 : index
+ %dims:3 = fir.box_dims %box, %c0 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+ %shift = fir.shift %dims#0 : (index) -> !fir.shift<1>
+ %c1_i64 = arith.constant 1 : i64
+ %elem = fir.array_coor %box(%shift) %c1_i64 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+ %c42 = arith.constant 42 : i32
+ // This store goes to shared target data (through pointer deref),
+ // so it MUST be in omp.single, not parallelized.
+ fir.store %c42 to %elem : !fir.ref<i32>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// The store through the pointer dereference must be inside omp.single.
+// CHECK: omp.parallel {
+// CHECK: fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK: omp.single
+// CHECK: fir.load
+// CHECK: fir.box_dims
+// CHECK: fir.array_coor
+// CHECK: fir.store
+// CHECK: omp.terminator
+// CHECK-NEXT: }
+// CHECK: omp.barrier
+// CHECK: }
+
+
+// Check that a direct store to the pointer descriptor alloca (not through
+// the pointer target) IS still recognized as thread-local.
+
+// CHECK-LABEL: func.func @pointer_descriptor_store_is_thread_local
+func.func @pointer_descriptor_store_is_thread_local() {
+ omp.parallel {
+ %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+ omp.workshare {
+ %null = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+ %c0 = arith.constant 0 : index
+ %shape = fir.shape %c0 : (index) -> !fir.shape<1>
+ %box = fir.embox %null(%shape) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+ // This store updates the descriptor itself (thread-local alloca),
+ // NOT the pointer target, so it should be parallelized.
+ fir.store %box to %desc : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// The store to the descriptor alloca is thread-local and should NOT be in omp.single.
+// CHECK: omp.parallel {
+// CHECK-NEXT: %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK: fir.zero_bits
+// CHECK: fir.shape
+// CHECK: fir.embox
+// CHECK: fir.store {{.*}} to %[[DESC]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK-NEXT: omp.barrier
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
>From 7472b41f54f62b4314b1a2508a270c80bbebe6c1 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Tue, 5 May 2026 00:31:42 -0500
Subject: [PATCH 2/3] Taking care of referenced type values.
---
flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 49 +++++++++++++++----
1 file changed, 39 insertions(+), 10 deletions(-)
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index cf51cb887622f..fe7064b7cda0b 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -361,9 +361,21 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
if (auto reloaded = rootMapping.lookupOrNull(v))
return nullptr;
Type ty = v.getType();
- Value alloc = fir::AllocaOp::create(allocaBuilder, loc, ty);
- fir::StoreOp::create(singleBuilder, loc, singleMapping.lookup(v), alloc);
- Value reloaded = fir::LoadOp::create(parallelBuilder, loc, ty, alloc);
+ // fir.alloca cannot wrap fir.ref, so for reference-typed values
+ // (e.g. results of dynamic fir.alloca ops) use fir.heap as the
+ // intermediary pointer type for the broadcast alloca.
+ Type allocTy = ty;
+ if (auto rt = mlir::dyn_cast<fir::ReferenceType>(ty))
+ allocTy = fir::HeapType::get(rt.getEleTy());
+ Value alloc = fir::AllocaOp::create(allocaBuilder, loc, allocTy);
+ Value singleVal = singleMapping.lookup(v);
+ if (allocTy != ty)
+ singleVal =
+ fir::ConvertOp::create(singleBuilder, loc, allocTy, singleVal);
+ fir::StoreOp::create(singleBuilder, loc, singleVal, alloc);
+ Value reloaded = fir::LoadOp::create(parallelBuilder, loc, allocTy, alloc);
+ if (allocTy != ty)
+ reloaded = fir::ConvertOp::create(parallelBuilder, loc, ty, reloaded);
rootMapping.map(v, reloaded);
return alloc;
};
@@ -423,13 +435,30 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
allParallelized = false;
}
} else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
- auto hoisted =
- cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
- rootMapping.map(&*alloca, &*hoisted);
- rootMapping.map(alloca.getResult(), hoisted.getResult());
- copyPrivate.push_back(hoisted);
- hoistedCopyprivateAllocas.insert(alloca.getResult());
- allParallelized = false;
+ if (alloca.isDynamic()) {
+ // Dynamic allocas (e.g. firstprivate arrays with runtime extent)
+ // cannot use the simple load/store copyprivate copy function
+ // because it only copies a single element for sequence types like
+ // !fir.array<?xi32>. Instead, keep the alloca in the single block
+ // and broadcast only its pointer to all threads.
+ singleBuilder.clone(op, singleMapping);
+ if (isTransitivelyUsedOutside(alloca.getResult(), sr)) {
+ auto alloc =
+ mapReloadedValue(alloca.getResult(), allocaBuilder,
+ singleBuilder, parallelBuilder, singleMapping);
+ if (alloc)
+ copyPrivate.push_back(alloc);
+ }
+ allParallelized = false;
+ } else {
+ auto hoisted =
+ cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
+ rootMapping.map(&*alloca, &*hoisted);
+ rootMapping.map(alloca.getResult(), hoisted.getResult());
+ copyPrivate.push_back(hoisted);
+ hoistedCopyprivateAllocas.insert(alloca.getResult());
+ allParallelized = false;
+ }
} else {
singleBuilder.clone(op, singleMapping);
// Prepare reloaded values for results of operations that cannot be
>From ed4377c87cb14b338d503e0791d38303e7ac7285 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Tue, 5 May 2026 03:55:28 -0500
Subject: [PATCH 3/3] Changes in test cases to check for array. In "parallel
workshare firstprivate(z)" z is an array.
---
.../OpenMP/workshare-firstprivate-pointer.f90 | 37 +++++++++++++
.../OpenMP/lower-workshare-thread-local.mlir | 54 +++++++++++++++++++
2 files changed, 91 insertions(+)
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
index 5e08c2dd161ec..bac5e4dcdc6a0 100644
--- a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -62,3 +62,40 @@ subroutine test_workshare_firstprivate_pointer(P)
! LLVM: define internal void @test_workshare_firstprivate_pointer_..omp_par
! The single construct must be present in the outlined function
! LLVM: call i32 @__kmpc_single
+
+! Test for "workshare firstprivate(z)" where z is an array.
+! Check code to correctly broadcast the address of the firstprivate
+! copy to all threads, instead of using a broken load/store copyprivate
+! that only copies a single element for dynamically-sized arrays.
+
+subroutine test_workshare_firstprivate_array(a, z, n)
+ integer(4) :: n
+ integer(4), dimension(n) :: z, a
+ !$omp parallel workshare firstprivate(z)
+ a = z + 1
+ !$omp end parallel workshare
+end subroutine
+
+! After workshare lowering, the dynamic alloca for the firstprivate copy
+! must be inside omp.single, with its address broadcast via a !fir.heap
+! indirection alloca + copyprivate.
+! FIR: func.func @_QPtest_workshare_firstprivate_array
+! FIR: omp.parallel {
+! The heap indirection alloca is hoisted for copyprivate
+! FIR: fir.alloca !fir.heap<!fir.array<?xi32>>
+! FIR: omp.single copyprivate(
+! The dynamic alloca (firstprivate copy) is inside the single block
+! FIR: fir.alloca !fir.array<?xi32>
+! FIR: fir.convert {{.*}} -> !fir.heap<!fir.array<?xi32>>
+! FIR: fir.store
+! The initialization of the firstprivate copy
+! FIR: fir.call @_FortranAAssign
+! FIR: omp.terminator
+! FIR: }
+! After single, the address is loaded and converted back
+! FIR: fir.load
+! FIR: fir.convert {{.*}} -> !fir.ref<!fir.array<?xi32>>
+! The workshared loop uses the broadcast address
+! FIR: omp.wsloop
+! FIR: omp.barrier
+! FIR: omp.terminator
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index 12bae176b70d2..c938860b1fc1f 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -483,3 +483,57 @@ func.func @pointer_descriptor_store_is_thread_local() {
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
+
+// -----
+
+// Test for "parallel workshare firstprivate(z)" where z is an array.
+// Check that z is broadcast to all private values of the threads.
+
+// CHECK-LABEL: func.func @dynamic_alloca_firstprivate_array
+func.func @dynamic_alloca_firstprivate_array(%n: index, %src: !fir.ref<!fir.array<?xi32>>, %dst: !fir.ref<!fir.array<?xi32>>) {
+ omp.parallel {
+ omp.workshare {
+ // Dynamic alloca for the firstprivate array copy
+ %z = fir.alloca !fir.array<?xi32>, %n {bindc_name = "z", pinned}
+ %shape = fir.shape %n : (index) -> !fir.shape<1>
+ %decl = fir.declare %z(%shape) {uniq_name = "z"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
+ // A side-effecting op that initializes the firstprivate copy
+ "test.init"(%decl, %src) : (!fir.ref<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
+ // Workshared loop that reads from the firstprivate array
+ %c1 = arith.constant 1 : index
+ omp.workshare.loop_wrapper {
+ omp.loop_nest (%i) : index = (%c1) to (%n) inclusive step (%c1) {
+ %elem = fir.array_coor %decl(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+ %val = fir.load %elem : !fir.ref<i32>
+ %dst_elem = fir.array_coor %dst(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+ fir.store %val to %dst_elem : !fir.ref<i32>
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// The dynamic alloca must be INSIDE the omp.single (not hoisted).
+// A !fir.heap indirection alloca is hoisted for copyprivate.
+// After the single, the array address is loaded and converted back to !fir.ref.
+// CHECK: omp.parallel {
+// CHECK: %[[PTR_ALLOC:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>>
+// CHECK: omp.single copyprivate(%[[PTR_ALLOC]] -> @_workshare_copy_heap_Uxi32 : !fir.ref<!fir.heap<!fir.array<?xi32>>>)
+// The dynamic alloca is inside the single block
+// CHECK: fir.alloca !fir.array<?xi32>
+// CHECK: fir.convert {{.*}} : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+// CHECK: fir.store {{.*}} to %[[PTR_ALLOC]]
+// CHECK: "test.init"
+// CHECK: omp.terminator
+// CHECK-NEXT: }
+// After single, load the broadcast array address and convert back to ref
+// CHECK: fir.load %[[PTR_ALLOC]]
+// CHECK: fir.convert {{.*}} : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+// The workshared loop uses the broadcast array address
+// CHECK: omp.wsloop
+// CHECK: omp.barrier
+// CHECK: }
More information about the flang-commits
mailing list