[flang-commits] [flang] [flang][OpenMP] incorrect handling for local variable in OpenMP parallel workshare firstprivate(P) (PR #195616)
via flang-commits
flang-commits at lists.llvm.org
Wed Jun 3 04:11:05 PDT 2026
https://github.com/SunilKuravinakop updated https://github.com/llvm/llvm-project/pull/195616
>From dfb65ee076ec5787675df968edc03cc0021b1892 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Mon, 4 May 2026 03:39:17 -0500
Subject: [PATCH 1/6] Changes to handle "!$omp parallel workshare
firstprivate(P)" where P is a pointer to an array. Handle the creation and
initialization of the local copy properly.
---
flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 32 +++++++-
.../OpenMP/workshare-firstprivate-pointer.f90 | 64 +++++++++++++++
.../OpenMP/lower-workshare-thread-local.mlir | 80 +++++++++++++++++++
3 files changed, 175 insertions(+), 1 deletion(-)
create mode 100644 flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index a41d8d8826501..cf51cb887622f 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -156,6 +156,11 @@ static bool isOpenMPThreadLocalMemory(Operation *op, Value mem) {
fir::AliasAnalysis aliasAnalysis;
fir::AliasAnalysis::Source source = aliasAnalysis.getSource(mem);
+ // With firstprivate(P) where P is a pointer, each thread gets its own copy
+ // of the descriptor, but P(i) accesses shared target data.
+ if (source.accessPath.hasPointerDeref())
+ return false;
+
// Check if the source is a Value (not a global symbol).
mlir::Value sourceValue =
llvm::dyn_cast_if_present<mlir::Value>(source.origin.u);
@@ -370,10 +375,34 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
SmallVector<Value> copyPrivate;
bool allParallelized = true;
+ // "firstprivate" pointer initialization creates: (1) alloca, (2) store
+ // null box, (3) copy original. If step (2) is duplicated into the
+ // parallel block, it runs after initialization of the private copy and
+ // overwrites the pointer descriptor with null, causing a segfault on
+ // dereference.
+ SmallPtrSet<Value, 4> hoistedCopyprivateAllocas;
+
for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
if (isSafeToParallelize(&op)) {
singleBuilder.clone(op, singleMapping);
- if (llvm::all_of(op.getOperands(), [&](Value opr) {
+ // Check if this operation writes to a hoisted copyprivate alloca.
+ // Such stores must stay only in the single block; the copyprivate
+ // mechanism handles broadcasting the final value to all threads.
+ bool writesToCopyprivateAlloca = false;
+ if (!hoistedCopyprivateAllocas.empty()) {
+ if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(&op)) {
+ SmallVector<MemoryEffects::EffectInstance> effects;
+ memEffects.getEffects(effects);
+ writesToCopyprivateAlloca =
+ llvm::any_of(effects, [&](const auto &eff) {
+ return isa<MemoryEffects::Write>(eff.getEffect()) &&
+ eff.getValue() &&
+ hoistedCopyprivateAllocas.contains(eff.getValue());
+ });
+ }
+ }
+ if (!writesToCopyprivateAlloca &&
+ llvm::all_of(op.getOperands(), [&](Value opr) {
// Either we have already remapped it
bool remapped = rootMapping.contains(opr);
// Or it is available because it dominates `sr`
@@ -399,6 +428,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
rootMapping.map(&*alloca, &*hoisted);
rootMapping.map(alloca.getResult(), hoisted.getResult());
copyPrivate.push_back(hoisted);
+ hoistedCopyprivateAllocas.insert(alloca.getResult());
allParallelized = false;
} else {
singleBuilder.clone(op, singleMapping);
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
new file mode 100644
index 0000000000000..5e08c2dd161ec
--- /dev/null
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -0,0 +1,64 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+! added to this directory and sub-directories.
+!===----------------------------------------------------------------------===!
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix HLFIR
+!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix FIR
+!RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix LLVM
+
+! Test that parallel workshare with firstprivate(P) where P is a pointer
+! correctly places stores through the pointer target in omp.single rather
+! than parallelizing them. The pointer descriptor is thread-local (firstprivate),
+! but the target data is shared memory.
+
+subroutine test_workshare_firstprivate_pointer(P)
+ integer, pointer, intent(in) :: P(:)
+ integer :: i
+ !$omp parallel workshare firstprivate(P)
+ forall (i = 1:SIZE(P)) P(i) = i
+ !$omp end parallel workshare
+end subroutine
+
+! HLFIR: omp.parallel {
+! HLFIR: omp.workshare {
+! The firstprivate copy: alloca, zero-init, declare, then copy from original
+! HLFIR: fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! HLFIR: fir.store
+! HLFIR: hlfir.declare
+! HLFIR: fir.load
+! HLFIR: fir.store
+! HLFIR: hlfir.forall
+! HLFIR: omp.terminator
+! HLFIR: }
+! HLFIR: omp.terminator
+! HLFIR: }
+
+! After workshare lowering, the forall body (which stores through the pointer
+! target) must be inside omp.single, not parallelized.
+! FIR: omp.parallel {
+! FIR: %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! The firstprivate init + copy and the forall loop must be in omp.single
+! FIR: omp.single copyprivate(%[[DESC]]
+! FIR: fir.store
+! FIR: fir.declare
+! FIR: fir.load
+! FIR: fir.store
+! The forall loop accesses pointer target (shared memory) - must stay in single
+! FIR: fir.do_loop
+! FIR: fir.array_coor
+! FIR: fir.store
+! FIR: omp.terminator
+! FIR: }
+! FIR: omp.barrier
+! FIR: omp.terminator
+
+! At LLVM IR level, verify the OpenMP fork call exists and the loop body
+! is inside the outlined function.
+! LLVM: call void {{.*}}__kmpc_fork_call
+! LLVM: define internal void @test_workshare_firstprivate_pointer_..omp_par
+! The single construct must be present in the outlined function
+! LLVM: call i32 @__kmpc_single
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index d6000c989515b..12bae176b70d2 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -403,3 +403,83 @@ func.func @forall_pattern_in_workshare(%shared: !fir.ref<i32>) {
// CHECK: }
// CHECK: omp.barrier
// CHECK: }
+
+
+// Check that a store through a pointer dereference is NOT considered
+// thread-local, even if the pointer descriptor itself is in a thread-local
+// alloca. This models the "parallel workshare firstprivate(P)" case where P
+// is a Fortran POINTER: each thread gets its own copy of the descriptor, but
+// P(i) accesses shared target data through the pointer.
+
+// CHECK-LABEL: func.func @pointer_deref_not_thread_local
+func.func @pointer_deref_not_thread_local() {
+ omp.parallel {
+ // Thread-local alloca for the pointer descriptor (models firstprivate)
+ %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+ %decl = fir.declare %desc {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "p"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ omp.workshare {
+ // Load the pointer box and access the target data via array_coor.
+ // Even though %desc is thread-local, the target data is shared.
+ %box = fir.load %decl : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ %c0 = arith.constant 0 : index
+ %dims:3 = fir.box_dims %box, %c0 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+ %shift = fir.shift %dims#0 : (index) -> !fir.shift<1>
+ %c1_i64 = arith.constant 1 : i64
+ %elem = fir.array_coor %box(%shift) %c1_i64 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+ %c42 = arith.constant 42 : i32
+ // This store goes to shared target data (through pointer deref),
+ // so it MUST be in omp.single, not parallelized.
+ fir.store %c42 to %elem : !fir.ref<i32>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// The store through the pointer dereference must be inside omp.single.
+// CHECK: omp.parallel {
+// CHECK: fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK: omp.single
+// CHECK: fir.load
+// CHECK: fir.box_dims
+// CHECK: fir.array_coor
+// CHECK: fir.store
+// CHECK: omp.terminator
+// CHECK-NEXT: }
+// CHECK: omp.barrier
+// CHECK: }
+
+
+// Check that a direct store to the pointer descriptor alloca (not through
+// the pointer target) IS still recognized as thread-local.
+
+// CHECK-LABEL: func.func @pointer_descriptor_store_is_thread_local
+func.func @pointer_descriptor_store_is_thread_local() {
+ omp.parallel {
+ %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+ omp.workshare {
+ %null = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+ %c0 = arith.constant 0 : index
+ %shape = fir.shape %c0 : (index) -> !fir.shape<1>
+ %box = fir.embox %null(%shape) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+ // This store updates the descriptor itself (thread-local alloca),
+ // NOT the pointer target, so it should be parallelized.
+ fir.store %box to %desc : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// The store to the descriptor alloca is thread-local and should NOT be in omp.single.
+// CHECK: omp.parallel {
+// CHECK-NEXT: %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK: fir.zero_bits
+// CHECK: fir.shape
+// CHECK: fir.embox
+// CHECK: fir.store {{.*}} to %[[DESC]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK-NEXT: omp.barrier
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
>From 7472b41f54f62b4314b1a2508a270c80bbebe6c1 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Tue, 5 May 2026 00:31:42 -0500
Subject: [PATCH 2/6] Take care of reference-typed values.
---
flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 49 +++++++++++++++----
1 file changed, 39 insertions(+), 10 deletions(-)
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index cf51cb887622f..fe7064b7cda0b 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -361,9 +361,21 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
if (auto reloaded = rootMapping.lookupOrNull(v))
return nullptr;
Type ty = v.getType();
- Value alloc = fir::AllocaOp::create(allocaBuilder, loc, ty);
- fir::StoreOp::create(singleBuilder, loc, singleMapping.lookup(v), alloc);
- Value reloaded = fir::LoadOp::create(parallelBuilder, loc, ty, alloc);
+ // fir.alloca cannot wrap fir.ref, so for reference-typed values
+ // (e.g. results of dynamic fir.alloca ops) use fir.heap as the
+ // intermediary pointer type for the broadcast alloca.
+ Type allocTy = ty;
+ if (auto rt = mlir::dyn_cast<fir::ReferenceType>(ty))
+ allocTy = fir::HeapType::get(rt.getEleTy());
+ Value alloc = fir::AllocaOp::create(allocaBuilder, loc, allocTy);
+ Value singleVal = singleMapping.lookup(v);
+ if (allocTy != ty)
+ singleVal =
+ fir::ConvertOp::create(singleBuilder, loc, allocTy, singleVal);
+ fir::StoreOp::create(singleBuilder, loc, singleVal, alloc);
+ Value reloaded = fir::LoadOp::create(parallelBuilder, loc, allocTy, alloc);
+ if (allocTy != ty)
+ reloaded = fir::ConvertOp::create(parallelBuilder, loc, ty, reloaded);
rootMapping.map(v, reloaded);
return alloc;
};
@@ -423,13 +435,30 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
allParallelized = false;
}
} else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
- auto hoisted =
- cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
- rootMapping.map(&*alloca, &*hoisted);
- rootMapping.map(alloca.getResult(), hoisted.getResult());
- copyPrivate.push_back(hoisted);
- hoistedCopyprivateAllocas.insert(alloca.getResult());
- allParallelized = false;
+ if (alloca.isDynamic()) {
+ // Dynamic allocas (e.g. firstprivate arrays with runtime extent)
+ // cannot use the simple load/store copyprivate copy function
+ // because it only copies a single element for sequence types like
+ // !fir.array<?xi32>. Instead, keep the alloca in the single block
+ // and broadcast only its pointer to all threads.
+ singleBuilder.clone(op, singleMapping);
+ if (isTransitivelyUsedOutside(alloca.getResult(), sr)) {
+ auto alloc =
+ mapReloadedValue(alloca.getResult(), allocaBuilder,
+ singleBuilder, parallelBuilder, singleMapping);
+ if (alloc)
+ copyPrivate.push_back(alloc);
+ }
+ allParallelized = false;
+ } else {
+ auto hoisted =
+ cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
+ rootMapping.map(&*alloca, &*hoisted);
+ rootMapping.map(alloca.getResult(), hoisted.getResult());
+ copyPrivate.push_back(hoisted);
+ hoistedCopyprivateAllocas.insert(alloca.getResult());
+ allParallelized = false;
+ }
} else {
singleBuilder.clone(op, singleMapping);
// Prepare reloaded values for results of operations that cannot be
>From ed4377c87cb14b338d503e0791d38303e7ac7285 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Tue, 5 May 2026 03:55:28 -0500
Subject: [PATCH 3/6] Changes in test cases to check the array case: in
"parallel workshare firstprivate(z)", z is an array.
---
.../OpenMP/workshare-firstprivate-pointer.f90 | 37 +++++++++++++
.../OpenMP/lower-workshare-thread-local.mlir | 54 +++++++++++++++++++
2 files changed, 91 insertions(+)
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
index 5e08c2dd161ec..bac5e4dcdc6a0 100644
--- a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -62,3 +62,40 @@ subroutine test_workshare_firstprivate_pointer(P)
! LLVM: define internal void @test_workshare_firstprivate_pointer_..omp_par
! The single construct must be present in the outlined function
! LLVM: call i32 @__kmpc_single
+
+! Test for "workshare firstprivate(z)" where z is an array.
+! Check code to correctly broadcast the address of the firstprivate
+! copy to all threads, instead of using a broken load/store copyprivate
+! that only copies a single element for dynamically-sized arrays.
+
+subroutine test_workshare_firstprivate_array(a, z, n)
+ integer(4) :: n
+ integer(4), dimension(n) :: z, a
+ !$omp parallel workshare firstprivate(z)
+ a = z + 1
+ !$omp end parallel workshare
+end subroutine
+
+! After workshare lowering, the dynamic alloca for the firstprivate copy
+! must be inside omp.single, with its address broadcast via a !fir.heap
+! indirection alloca + copyprivate.
+! FIR: func.func @_QPtest_workshare_firstprivate_array
+! FIR: omp.parallel {
+! The heap indirection alloca is hoisted for copyprivate
+! FIR: fir.alloca !fir.heap<!fir.array<?xi32>>
+! FIR: omp.single copyprivate(
+! The dynamic alloca (firstprivate copy) is inside the single block
+! FIR: fir.alloca !fir.array<?xi32>
+! FIR: fir.convert {{.*}} -> !fir.heap<!fir.array<?xi32>>
+! FIR: fir.store
+! The initialization of the firstprivate copy
+! FIR: fir.call @_FortranAAssign
+! FIR: omp.terminator
+! FIR: }
+! After single, the address is loaded and converted back
+! FIR: fir.load
+! FIR: fir.convert {{.*}} -> !fir.ref<!fir.array<?xi32>>
+! The workshared loop uses the broadcast address
+! FIR: omp.wsloop
+! FIR: omp.barrier
+! FIR: omp.terminator
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index 12bae176b70d2..c938860b1fc1f 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -483,3 +483,57 @@ func.func @pointer_descriptor_store_is_thread_local() {
// CHECK-NEXT: omp.barrier
// CHECK-NEXT: omp.terminator
// CHECK-NEXT: }
+
+// -----
+
+// Test for "parallel workshare firstprivate(z)" where z is an array.
+// Check that z is broadcast to all private values of the threads.
+
+// CHECK-LABEL: func.func @dynamic_alloca_firstprivate_array
+func.func @dynamic_alloca_firstprivate_array(%n: index, %src: !fir.ref<!fir.array<?xi32>>, %dst: !fir.ref<!fir.array<?xi32>>) {
+ omp.parallel {
+ omp.workshare {
+ // Dynamic alloca for the firstprivate array copy
+ %z = fir.alloca !fir.array<?xi32>, %n {bindc_name = "z", pinned}
+ %shape = fir.shape %n : (index) -> !fir.shape<1>
+ %decl = fir.declare %z(%shape) {uniq_name = "z"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
+ // A side-effecting op that initializes the firstprivate copy
+ "test.init"(%decl, %src) : (!fir.ref<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
+ // Workshared loop that reads from the firstprivate array
+ %c1 = arith.constant 1 : index
+ omp.workshare.loop_wrapper {
+ omp.loop_nest (%i) : index = (%c1) to (%n) inclusive step (%c1) {
+ %elem = fir.array_coor %decl(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+ %val = fir.load %elem : !fir.ref<i32>
+ %dst_elem = fir.array_coor %dst(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+ fir.store %val to %dst_elem : !fir.ref<i32>
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// The dynamic alloca must be INSIDE the omp.single (not hoisted).
+// A !fir.heap indirection alloca is hoisted for copyprivate.
+// After the single, the array address is loaded and converted back to !fir.ref.
+// CHECK: omp.parallel {
+// CHECK: %[[PTR_ALLOC:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>>
+// CHECK: omp.single copyprivate(%[[PTR_ALLOC]] -> @_workshare_copy_heap_Uxi32 : !fir.ref<!fir.heap<!fir.array<?xi32>>>)
+// The dynamic alloca is inside the single block
+// CHECK: fir.alloca !fir.array<?xi32>
+// CHECK: fir.convert {{.*}} : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+// CHECK: fir.store {{.*}} to %[[PTR_ALLOC]]
+// CHECK: "test.init"
+// CHECK: omp.terminator
+// CHECK-NEXT: }
+// After single, load the broadcast array address and convert back to ref
+// CHECK: fir.load %[[PTR_ALLOC]]
+// CHECK: fir.convert {{.*}} : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+// The workshared loop uses the broadcast array address
+// CHECK: omp.wsloop
+// CHECK: omp.barrier
+// CHECK: }
>From c2dd9ddd99e52242b5f138894b7fd71a44aa40f8 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Wed, 13 May 2026 11:35:48 -0500
Subject: [PATCH 4/6] 1) Making the checks in the tests more detailed based on
feedback. 2) Using fir.box and fir.embox for private copies in "omp
parallel workshare firstprivate(p)".
---
flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 37 +++-
.../OpenMP/workshare-firstprivate-pointer.f90 | 209 ++++++++++++++----
.../OpenMP/lower-workshare-thread-local.mlir | 49 +---
3 files changed, 200 insertions(+), 95 deletions(-)
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index fe7064b7cda0b..bc704de66865f 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -362,11 +362,11 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
return nullptr;
Type ty = v.getType();
// fir.alloca cannot wrap fir.ref, so for reference-typed values
- // (e.g. results of dynamic fir.alloca ops) use fir.heap as the
+ // (e.g. results of dynamic fir.alloca ops) use fir.ptr as the
// intermediary pointer type for the broadcast alloca.
Type allocTy = ty;
if (auto rt = mlir::dyn_cast<fir::ReferenceType>(ty))
- allocTy = fir::HeapType::get(rt.getEleTy());
+ allocTy = fir::PointerType::get(rt.getEleTy());
Value alloc = fir::AllocaOp::create(allocaBuilder, loc, allocTy);
Value singleVal = singleMapping.lookup(v);
if (allocTy != ty)
@@ -440,14 +440,35 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
// cannot use the simple load/store copyprivate copy function
// because it only copies a single element for sequence types like
// !fir.array<?xi32>. Instead, keep the alloca in the single block
- // and broadcast only its pointer to all threads.
+ // and broadcast its address via a box to all threads. The box
+ // preserves shape information and is semantically correct for
+ // copyprivate.
singleBuilder.clone(op, singleMapping);
if (isTransitivelyUsedOutside(alloca.getResult(), sr)) {
- auto alloc =
- mapReloadedValue(alloca.getResult(), allocaBuilder,
- singleBuilder, parallelBuilder, singleMapping);
- if (alloc)
- copyPrivate.push_back(alloc);
+ Value clonedResult = singleMapping.lookup(alloca.getResult());
+ if (!rootMapping.lookupOrNull(alloca.getResult())) {
+ // Create a box type wrapping the allocated array type.
+ Type eleTy =
+ cast<fir::ReferenceType>(alloca.getType()).getEleTy();
+ auto boxTy = fir::BoxType::get(eleTy);
+ Value boxAlloc = fir::AllocaOp::create(allocaBuilder, loc, boxTy);
+ // In single: create a shape from the alloca extents, embox
+ // the array, and store the box.
+ SmallVector<Value> extents;
+ for (Value ext : alloca.getShape())
+ extents.push_back(singleMapping.lookupOrDefault(ext));
+ Value shape = fir::ShapeOp::create(singleBuilder, loc, extents);
+ Value box = fir::EmboxOp::create(singleBuilder, loc, boxTy,
+ clonedResult, shape);
+ fir::StoreOp::create(singleBuilder, loc, box, boxAlloc);
+ // After single: load the box and extract the address.
+ Value loadedBox =
+ fir::LoadOp::create(parallelBuilder, loc, boxTy, boxAlloc);
+ Value addr = fir::BoxAddrOp::create(parallelBuilder, loc,
+ alloca.getType(), loadedBox);
+ rootMapping.map(alloca.getResult(), addr);
+ copyPrivate.push_back(boxAlloc);
+ }
}
allParallelized = false;
} else {
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
index bac5e4dcdc6a0..eb266a450e55d 100644
--- a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -9,6 +9,7 @@
!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix HLFIR
!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix FIR
!RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix LLVM
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | fir-opt --lower-workshare --allow-unregistered-dialect -o - | FileCheck %s --check-prefix FIROPT
! Test that parallel workshare with firstprivate(P) where P is a pointer
! correctly places stores through the pointer target in omp.single rather
@@ -23,45 +24,162 @@ subroutine test_workshare_firstprivate_pointer(P)
!$omp end parallel workshare
end subroutine
-! HLFIR: omp.parallel {
-! HLFIR: omp.workshare {
+! HLFIR-LABEL: {{.*}}test_workshare_firstprivate_pointer{{.*}} {
+! HLFIR: %[[ORIG_P:.*]]:2 = hlfir.declare %{{.*}} {{.*}}uniq_name = "_QFtest_workshare_firstprivate_pointerEp"
+! HLFIR-LABEL: omp.parallel {
+! HLFIR-LABEL: omp.workshare {
! The firstprivate copy: alloca, zero-init, declare, then copy from original
-! HLFIR: fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
-! HLFIR: fir.store
-! HLFIR: hlfir.declare
-! HLFIR: fir.load
-! HLFIR: fir.store
+! HLFIR: %[[FP_ALLOCA:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! HLFIR: fir.store %{{.*}} to %[[FP_ALLOCA]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! HLFIR: %[[FP_DECL:.*]]:2 = hlfir.declare %[[FP_ALLOCA]] {{{.*}}uniq_name = "_QFtest_workshare_firstprivate_pointerEp"}
+! HLFIR: %[[ORIG_VAL:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! HLFIR: fir.store %[[ORIG_VAL]] to %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
! HLFIR: hlfir.forall
! HLFIR: omp.terminator
! HLFIR: }
! HLFIR: omp.terminator
! HLFIR: }
+! HLFIR: return
! After workshare lowering, the forall body (which stores through the pointer
! target) must be inside omp.single, not parallelized.
-! FIR: omp.parallel {
-! FIR: %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
-! The firstprivate init + copy and the forall loop must be in omp.single
-! FIR: omp.single copyprivate(%[[DESC]]
-! FIR: fir.store
-! FIR: fir.declare
-! FIR: fir.load
-! FIR: fir.store
-! The forall loop accesses pointer target (shared memory) - must stay in single
-! FIR: fir.do_loop
-! FIR: fir.array_coor
-! FIR: fir.store
-! FIR: omp.terminator
-! FIR: }
-! FIR: omp.barrier
-! FIR: omp.terminator
+! FIR: {{.*}}test_workshare_firstprivate_pointer
+! FIR-SAME: (%[[ARG0:.*]]: {{.*}}) {
+! FIR: %[[C1:.*]] = arith.constant 1 : index
+! FIR: %[[C1_I32:.*]] = arith.constant 1 : i32
+! FIR: %[[C0:.*]] = arith.constant 0 : index
+! FIR: %[[DSCOPE:.*]] = fir.dummy_scope{{.*}}
+! FIR: %[[P_DECL:.*]] = fir.declare %[[ARG0]]{{.*}}fortran_attrs = #fir.var_attrs<intent_in, pointer>{{.*}}
+! FIR: omp.parallel {
+! Thread-private storage for firstprivate pointer descriptor.
+! FIR: %[[P_PRIV:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {
+! FIR-SAME: bindc_name = "p"
+! FIR-SAME: pinned
+! FIR: omp.single copyprivate(%[[P_PRIV]]{{.*}} {
+! FIR: %[[ZERO_PTR:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+! FIR: %[[SHAPE:.*]] = fir.shape %[[C0]]
+! FIR: %[[EMPTY_BOX:.*]] = fir.embox %[[ZERO_PTR]](%[[SHAPE]])
+! FIR: fir.store %[[EMPTY_BOX]] to %[[P_PRIV]]
+! FIR-SAME: : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! FIR: %[[P_FP_DECL:.*]] = fir.declare %[[P_PRIV]]
+! FIR-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
+! FIR: %[[ORIG_BOX:.*]] = fir.load %[[P_DECL]]
+! FIR-SAME: : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! FIR: fir.store %[[ORIG_BOX]] to %[[P_FP_DECL]]
+! FIR-SAME: : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! FIR: %[[P_PRIVATE:.*]] = fir.load %[[P_FP_DECL]]
+! FIR: %[[P_SIZE:.*]]:3 = fir.box_dims %[[P_PRIVATE]], %[[C0]]
+! FIR: %[[SIZE_TMP1:.*]] = fir.convert %[[P_SIZE]]#1
+! FIR: %[[SIZE_TMP2:.*]] = fir.convert %[[SIZE_TMP1]]
+! FIR: %[[LOOP_LB:.*]] = fir.convert %[[C1_I32]]
+! FIR: %[[LOOP_UB:.*]] = fir.convert %[[SIZE_TMP2]]
+! FIR: fir.do_loop %[[IV:.*]] = %[[LOOP_LB]] to %[[LOOP_UB]] step %[[C1]] {
+! FIR: %[[IV_VAL:.*]] = fir.convert %[[IV]]
+! FIR: fir.store %[[IV_VAL]] to %[[I_PRIV:.*]] : !fir.ref<i32>
+! FIR: %[[RHS_STORE_VAL:.*]] = fir.load %[[I_PRIV]] : !fir.ref<i32>
+! FIR: %[[P_CUR:.*]] = fir.load %[[P_FP_DECL]]
+! FIR: %[[LHS_ELEM_ADDR:.*]] = fir.array_coor %[[P_CUR]]
+! FIR: fir.store %[[RHS_STORE_VAL]] to %[[LHS_ELEM_ADDR]] : !fir.ref<i32>
+! FIR: omp.terminator
+! FIR: }
+! FIR: omp.barrier
+! FIR: omp.terminator
+! FIR: }
+! FIR: return
+
+! FIROPT: func.func @_QPtest_workshare_firstprivate_pointer(
+! FIROPT-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+
+! FIROPT: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! FIROPT: %[[I_ALLOC:.*]] = fir.alloca i32
+! FIROPT: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_ALLOC]]
+
+! FIROPT: %[[P_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[SCOPE]] arg 1
+! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
+
+! FIROPT: omp.parallel {
+
+! FIROPT: %[[P_PRIV:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "p", pinned
+
+! FIROPT: omp.single copyprivate(%[[P_PRIV]] -> @_workshare_copy_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) nowait {
+
+! FIROPT: %[[ZERO_PTR:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+! FIROPT: %[[C0:.*]] = arith.constant 0 : index
+! FIROPT: %[[SHAPE:.*]] = fir.shape %[[C0]] : (index) -> !fir.shape<1>
+! FIROPT: %[[EMBOX:.*]] = fir.embox %[[ZERO_PTR]](%[[SHAPE]])
+! FIROPT: fir.store %[[EMBOX]] to %[[P_PRIV]]
+
+! FIROPT: %[[P_FP:.*]]:2 = hlfir.declare %[[P_PRIV]]
+! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
+
+! FIROPT: %[[LOAD_ORIG:.*]] = fir.load %[[P_DECL]]#0
+! FIROPT: fir.store %[[LOAD_ORIG]] to %[[P_FP]]#0
+
+! FIROPT: %[[C1:.*]] = arith.constant 1 : i32
+
+! FIROPT: %[[LOAD_PRIV:.*]] = fir.load %[[P_FP]]#0
+! FIROPT: %[[C0_2:.*]] = arith.constant 0 : index
+! FIROPT: %[[DIMS:.*]]:3 = fir.box_dims %[[LOAD_PRIV]], %[[C0_2]]
+
+! FIROPT: %[[EXT64:.*]] = fir.convert %[[DIMS]]#1 : (index) -> i64
+! FIROPT: %[[EXT32:.*]] = fir.convert %[[EXT64]] : (i64) -> i32
+
+! FIROPT: hlfir.forall lb {
+! FIROPT: hlfir.yield %[[C1]] : i32
+! FIROPT: } ub {
+! FIROPT: hlfir.yield %[[EXT32]] : i32
+! FIROPT: } (%[[IV:.*]]: i32) {
+
+! FIROPT: %[[IDX:.*]] = hlfir.forall_index "i" %[[IV]] : (i32) -> !fir.ref<i32>
+
+! FIROPT: hlfir.region_assign {
+
+! FIROPT: %[[IDX_VAL:.*]] = fir.load %[[IDX]] : !fir.ref<i32>
+! FIROPT: hlfir.yield %[[IDX_VAL]] : i32
+
+! FIROPT: } to {
+
+! FIROPT: %[[LOAD_BOX:.*]] = fir.load %[[P_FP]]#0
+! FIROPT: %[[LOAD_I:.*]] = fir.load %[[IDX]] : !fir.ref<i32>
+! FIROPT: %[[IDX64:.*]] = fir.convert %[[LOAD_I]] : (i32) -> i64
+
+! FIROPT: %[[DESIG:.*]] = hlfir.designate %[[LOAD_BOX]] (%[[IDX64]])
+! FIROPT-SAME: -> !fir.ref<i32>
+
+! FIROPT: hlfir.yield %[[DESIG]] : !fir.ref<i32>
+
+! FIROPT: }
+! FIROPT: }
+
+! FIROPT: omp.terminator
+! FIROPT: }
+
+! FIROPT: %[[POST_DECL:.*]]:2 = hlfir.declare %[[P_PRIV]]
+! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
+
+! FIROPT: omp.barrier
+! FIROPT: omp.terminator
+! FIROPT: }
! At LLVM IR level, verify the OpenMP fork call exists and the loop body
! is inside the outlined function.
-! LLVM: call void {{.*}}__kmpc_fork_call
-! LLVM: define internal void @test_workshare_firstprivate_pointer_..omp_par
-! The single construct must be present in the outlined function
+! LLVM: call void {{.*}}__kmpc_fork_call({{.*}}@test_workshare_firstprivate_pointer_..omp_par{{.*}})
+! LLVM: {{.*}}test_workshare_firstprivate_pointer_..omp_par{{.*}}
+! LLVM-LABEL: omp.par.region{{[0-9]+}}:
! LLVM: call i32 @__kmpc_single
+! LLVM: icmp ne i32
+! LLVM-LABEL: omp_region.end:
+! LLVM: call void @__kmpc_copyprivate
+! LLVM: call void {{.*}}__kmpc_barrier
+! LLVM-LABEL: omp.single.region:
+! LLVM: call void @llvm.memcpy{{.*}}
+! LLVM: getelementptr {{.*}} i32 0, i32 7
+! LLVM: load i64{{.*}}
+! LLVM-LABEL: omp_region.finalize:
+! LLVM: call void @__kmpc_end_single
+! LLVM: store i32 %{{.*}}, ptr %{{.*}}
+! LLVM: getelementptr nusw nuw i8
+! LLVM: ret void
! Test for "workshare firstprivate(z)" where z is an array.
! Check code to correctly broadcast the address of the firstprivate
@@ -77,25 +195,38 @@ subroutine test_workshare_firstprivate_array(a, z, n)
end subroutine
! After workshare lowering, the dynamic alloca for the firstprivate copy
-! must be inside omp.single, with its address broadcast via a !fir.heap
+! must be inside omp.single, with its address broadcast via a !fir.box
! indirection alloca + copyprivate.
-! FIR: func.func @_QPtest_workshare_firstprivate_array
-! FIR: omp.parallel {
-! The heap indirection alloca is hoisted for copyprivate
-! FIR: fir.alloca !fir.heap<!fir.array<?xi32>>
-! FIR: omp.single copyprivate(
+! FIR-LABEL: {{.*}}test_workshare_firstprivate_array(
+! FIR: %[[C1:.*]] = arith.constant 1 : i32
+! FIR-LABEL: omp.parallel {
+
+! The box indirection alloca is hoisted for copyprivate
+! FIR: omp.single copyprivate(%[[BOX_INDIRECT:.*]] -> @_workshare_copy_box_Uxi32{{.*}}) {
+
! The dynamic alloca (firstprivate copy) is inside the single block
-! FIR: fir.alloca !fir.array<?xi32>
-! FIR: fir.convert {{.*}} -> !fir.heap<!fir.array<?xi32>>
-! FIR: fir.store
+! FIR: %[[FP_ARRAY:.*]] = fir.alloca{{.*}}
+
+! Runtime shape construction for the firstprivate array.
+! FIR: %[[SHAPE:.*]] = fir.shape %{{.*}}
+! FIR: %[[BOX_VAL:.*]] = fir.embox %[[FP_ARRAY]](%[[SHAPE]]){{.*}}fir.array<?xi32>{{.*}}
+! FIR: fir.store %[[BOX_VAL]] to %[[BOX_INDIRECT]] {{.*}}fir.array<?xi32>{{.*}}
+
! The initialization of the firstprivate copy
! FIR: fir.call @_FortranAAssign
! FIR: omp.terminator
! FIR: }
-! After single, the address is loaded and converted back
-! FIR: fir.load
-! FIR: fir.convert {{.*}} -> !fir.ref<!fir.array<?xi32>>
+! After single, the box is loaded and the address extracted
+! FIR: %[[LOADED_BOX:.*]] = fir.load %[[BOX_INDIRECT]]{{.*}}fir.array<?xi32>{{.*}}
+! FIR: %[[ARRAY_ADDR:.*]] = fir.box_addr %[[LOADED_BOX]]{{.*}}fir.array<?xi32>>{{.*}}
! The workshared loop uses the broadcast address
-! FIR: omp.wsloop
+! FIR: omp.wsloop {
+! FIR: %[[SRC_ELEM:.*]] = fir.array_coor %{{.*}}(%{{.*}}) %{{.*}}
+! FIR: %[[SRC_VAL:.*]] = fir.load %[[SRC_ELEM]]
+! FIR: %[[ADD_RES:.*]] = arith.addi %[[SRC_VAL]], %[[C1]] : i32
+! FIR: %[[DST_ELEM:.*]] = fir.array_coor %{{.*}}(%{{.*}}) %{{.*}}
+! FIR: fir.store %[[ADD_RES]] to %[[DST_ELEM]]
+! FIR: }
! FIR: omp.barrier
! FIR: omp.terminator
+! FIR: return
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index c938860b1fc1f..e8cb9065123e7 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -488,52 +488,5 @@ func.func @pointer_descriptor_store_is_thread_local() {
// Test for "parallel workshare firstprivate(z)" where z is an array.
// Check that z is broadcast to all private values of the threads.
+// This test is now part of flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
-// CHECK-LABEL: func.func @dynamic_alloca_firstprivate_array
-func.func @dynamic_alloca_firstprivate_array(%n: index, %src: !fir.ref<!fir.array<?xi32>>, %dst: !fir.ref<!fir.array<?xi32>>) {
- omp.parallel {
- omp.workshare {
- // Dynamic alloca for the firstprivate array copy
- %z = fir.alloca !fir.array<?xi32>, %n {bindc_name = "z", pinned}
- %shape = fir.shape %n : (index) -> !fir.shape<1>
- %decl = fir.declare %z(%shape) {uniq_name = "z"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
- // A side-effecting op that initializes the firstprivate copy
- "test.init"(%decl, %src) : (!fir.ref<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
- // Workshared loop that reads from the firstprivate array
- %c1 = arith.constant 1 : index
- omp.workshare.loop_wrapper {
- omp.loop_nest (%i) : index = (%c1) to (%n) inclusive step (%c1) {
- %elem = fir.array_coor %decl(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
- %val = fir.load %elem : !fir.ref<i32>
- %dst_elem = fir.array_coor %dst(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
- fir.store %val to %dst_elem : !fir.ref<i32>
- omp.yield
- }
- }
- omp.terminator
- }
- omp.terminator
- }
- return
-}
-
-// The dynamic alloca must be INSIDE the omp.single (not hoisted).
-// A !fir.heap indirection alloca is hoisted for copyprivate.
-// After the single, the array address is loaded and converted back to !fir.ref.
-// CHECK: omp.parallel {
-// CHECK: %[[PTR_ALLOC:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>>
-// CHECK: omp.single copyprivate(%[[PTR_ALLOC]] -> @_workshare_copy_heap_Uxi32 : !fir.ref<!fir.heap<!fir.array<?xi32>>>)
-// The dynamic alloca is inside the single block
-// CHECK: fir.alloca !fir.array<?xi32>
-// CHECK: fir.convert {{.*}} : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
-// CHECK: fir.store {{.*}} to %[[PTR_ALLOC]]
-// CHECK: "test.init"
-// CHECK: omp.terminator
-// CHECK-NEXT: }
-// After single, load the broadcast array address and convert back to ref
-// CHECK: fir.load %[[PTR_ALLOC]]
-// CHECK: fir.convert {{.*}} : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
-// The workshared loop uses the broadcast array address
-// CHECK: omp.wsloop
-// CHECK: omp.barrier
-// CHECK: }
>From be6a825bac11c391a2f808c52514c7d52650c3e0 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Wed, 20 May 2026 03:27:07 -0500
Subject: [PATCH 5/6] 1) Fix so that every thread has its own private
variables. 2) Move the fir-opt checks back into
flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir from
flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90. 3) Add
extra tests to
flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90.
---
flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 145 +++++--
.../OpenMP/workshare-firstprivate-pointer.f90 | 366 +++++++++++++-----
.../OpenMP/lower-workshare-thread-local.mlir | 73 +++-
3 files changed, 453 insertions(+), 131 deletions(-)
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index bc704de66865f..50463810d5025 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -301,6 +301,74 @@ static mlir::func::FuncOp createCopyFunc(mlir::Location loc, mlir::Type varType,
return funcOp;
}
+/// Creates a copy function for box types that copies the array DATA
+/// (not just the descriptor) using the Fortran runtime's Assign function.
+/// This is needed for copyprivate of dynamically-sized arrays where each
+/// thread has its own allocation and needs the data copied from the
+/// single-executing thread.
+///
+/// \param loc     Location attached to every operation created here.
+/// \param varType Type of the copyprivate variable; must be a
+///                fir::ReferenceType (enforced by the cast below). Both
+///                arguments of the generated function have this type.
+/// \param builder Builder used to emit the function; taken by value, so
+///                insertion-point changes made here apply to the local copy.
+/// \return The copy function for this type. Its symbol name is built from the
+///         referenced type and the "_workshare_copy_data" prefix; if the
+///         module already contains a function with that name, that function
+///         is returned and nothing new is created.
+static mlir::func::FuncOp createBoxDataCopyFunc(mlir::Location loc,
+ mlir::Type varType,
+ fir::FirOpBuilder builder) {
+ mlir::ModuleOp module = builder.getModule();
+ auto rt = cast<fir::ReferenceType>(varType);
+ mlir::Type eleTy = rt.getEleTy();
+ std::string copyFuncName =
+ fir::getTypeAsString(eleTy, builder.getKindMap(), "_workshare_copy_data");
+
+ // Reuse the copy function if one was already generated for this type.
+ if (auto decl = module.lookupSymbol<mlir::func::FuncOp>(copyFuncName))
+ return decl;
+
+ // Look up _FortranAAssign; if the module does not declare it yet, add a
+ // private declaration with the signature
+ // (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.ref<i8>, i32) -> ().
+ auto boxNoneTy = fir::BoxType::get(builder.getNoneType());
+ auto refBoxNoneTy = fir::ReferenceType::get(boxNoneTy);
+ auto refI8Ty = fir::ReferenceType::get(builder.getIntegerType(8));
+ auto i32Ty = builder.getI32Type();
+ llvm::StringRef assignFuncName = "_FortranAAssign";
+ auto assignFunc = module.lookupSymbol<mlir::func::FuncOp>(assignFuncName);
+ if (!assignFunc) {
+ // NOTE(review): this guard looks redundant, since only modBuilder is used
+ // inside this scope and `builder` is not repositioned here. Confirm.
+ mlir::OpBuilder::InsertionGuard g(builder);
+ mlir::OpBuilder modBuilder(module.getBodyRegion());
+ auto assignFuncType = mlir::FunctionType::get(
+ builder.getContext(), {refBoxNoneTy, boxNoneTy, refI8Ty, i32Ty}, {});
+ assignFunc = mlir::func::FuncOp::create(modBuilder, loc, assignFuncName,
+ assignFuncType);
+ assignFunc.setVisibility(mlir::SymbolTable::Visibility::Private);
+ }
+
+ // Create the private copy function in the module body. It takes two
+ // arguments of type varType: (0) the destination and (1) the source. The
+ // guard covers the insertion-point changes made by createBlock and
+ // setInsertionPointToStart below.
+ mlir::OpBuilder::InsertionGuard guard(builder);
+ mlir::OpBuilder modBuilder(module.getBodyRegion());
+ llvm::SmallVector<mlir::Type> argsTy = {varType, varType};
+ auto funcType = mlir::FunctionType::get(builder.getContext(), argsTy, {});
+ mlir::func::FuncOp funcOp =
+ mlir::func::FuncOp::create(modBuilder, loc, copyFuncName, funcType);
+ funcOp.setVisibility(mlir::SymbolTable::Visibility::Private);
+ fir::factory::setInternalLinkage(funcOp);
+ builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy,
+ {loc, loc});
+ builder.setInsertionPointToStart(&funcOp.getRegion().back());
+
+ // Load the source box from the second argument.
+ Value srcBox =
+ fir::LoadOp::create(builder, loc, eleTy, funcOp.getArgument(1));
+
+ // Convert the operands to the types of the _FortranAAssign declaration:
+ // the destination stays a reference (to a box of none), the source is
+ // passed as a box value.
+ Value dstConv =
+ fir::ConvertOp::create(builder, loc, refBoxNoneTy, funcOp.getArgument(0));
+ Value srcConv = fir::ConvertOp::create(builder, loc, boxNoneTy, srcBox);
+
+ // Use null source location (only used for error reporting).
+ Value nullLoc = fir::ZeroOp::create(builder, loc, refI8Ty);
+ Value zeroLine = builder.createIntegerConstant(loc, i32Ty, 0);
+
+ // Call _FortranAAssign(dst, src, file, line) to copy the array data.
+ fir::CallOp::create(builder, loc, assignFunc,
+ mlir::ValueRange{dstConv, srcConv, nullLoc, zeroLine});
+
+ mlir::func::ReturnOp::create(builder, loc);
+ return funcOp;
+}
+
static bool isUserOutsideSR(Operation *user, Operation *parentOp,
SingleRegion sr) {
while (user->getParentOp() != parentOp)
@@ -380,11 +448,12 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
return alloc;
};
- auto moveToSingle =
- [&](SingleRegion sr, OpBuilder allocaBuilder, OpBuilder singleBuilder,
- OpBuilder parallelBuilder) -> std::pair<bool, SmallVector<Value>> {
+ auto moveToSingle = [&](SingleRegion sr, OpBuilder allocaBuilder,
+ OpBuilder singleBuilder, OpBuilder parallelBuilder)
+ -> std::tuple<bool, SmallVector<Value>, SmallPtrSet<Value, 4>> {
IRMapping singleMapping = rootMapping;
SmallVector<Value> copyPrivate;
+ SmallPtrSet<Value, 4> boxDataCopyVars;
bool allParallelized = true;
// "firstprivate" pointer initialization creates: (1) alloca, (2) store
@@ -408,8 +477,8 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
writesToCopyprivateAlloca =
llvm::any_of(effects, [&](const auto &eff) {
return isa<MemoryEffects::Write>(eff.getEffect()) &&
- eff.getValue() &&
- hoistedCopyprivateAllocas.contains(eff.getValue());
+ (!eff.getValue() ||
+ hoistedCopyprivateAllocas.contains(eff.getValue()));
});
}
}
@@ -437,39 +506,34 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
} else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
if (alloca.isDynamic()) {
// Dynamic allocas (e.g. firstprivate arrays with runtime extent)
- // cannot use the simple load/store copyprivate copy function
- // because it only copies a single element for sequence types like
- // !fir.array<?xi32>. Instead, keep the alloca in the single block
- // and broadcast its address via a box to all threads. The box
- // preserves shape information and is semantically correct for
- // copyprivate.
- singleBuilder.clone(op, singleMapping);
+ // are hoisted so each thread gets its own allocation, providing
+ // true firstprivate semantics. The array data is broadcast via
+ // copyprivate using a box that carries shape information. The
+ // copyprivate copy function uses _FortranAAssign to copy the
+ // actual array data (not just the descriptor) between threads.
+ auto hoisted =
+ cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
+ rootMapping.map(&*alloca, &*hoisted);
+ rootMapping.map(alloca.getResult(), hoisted.getResult());
+
if (isTransitivelyUsedOutside(alloca.getResult(), sr)) {
- Value clonedResult = singleMapping.lookup(alloca.getResult());
- if (!rootMapping.lookupOrNull(alloca.getResult())) {
- // Create a box type wrapping the allocated array type.
- Type eleTy =
- cast<fir::ReferenceType>(alloca.getType()).getEleTy();
- auto boxTy = fir::BoxType::get(eleTy);
- Value boxAlloc = fir::AllocaOp::create(allocaBuilder, loc, boxTy);
- // In single: create a shape from the alloca extents, embox
- // the array, and store the box.
- SmallVector<Value> extents;
- for (Value ext : alloca.getShape())
- extents.push_back(singleMapping.lookupOrDefault(ext));
- Value shape = fir::ShapeOp::create(singleBuilder, loc, extents);
- Value box = fir::EmboxOp::create(singleBuilder, loc, boxTy,
- clonedResult, shape);
- fir::StoreOp::create(singleBuilder, loc, box, boxAlloc);
- // After single: load the box and extract the address.
- Value loadedBox =
- fir::LoadOp::create(parallelBuilder, loc, boxTy, boxAlloc);
- Value addr = fir::BoxAddrOp::create(parallelBuilder, loc,
- alloca.getType(), loadedBox);
- rootMapping.map(alloca.getResult(), addr);
- copyPrivate.push_back(boxAlloc);
- }
+ // Create a box slot for copyprivate to broadcast the array data.
+ Type eleTy = cast<fir::ReferenceType>(alloca.getType()).getEleTy();
+ auto boxTy = fir::BoxType::get(eleTy);
+ Value boxAlloc = fir::AllocaOp::create(allocaBuilder, loc, boxTy);
+
+ // Embox the per-thread array allocation with its shape extents.
+ Value shape =
+ fir::ShapeOp::create(allocaBuilder, loc, hoisted.getShape());
+ Value box = fir::EmboxOp::create(allocaBuilder, loc, boxTy,
+ hoisted.getResult(), shape);
+ fir::StoreOp::create(allocaBuilder, loc, box, boxAlloc);
+
+ copyPrivate.push_back(boxAlloc);
+ boxDataCopyVars.insert(boxAlloc);
}
+
+ hoistedCopyprivateAllocas.insert(alloca.getResult());
allParallelized = false;
} else {
auto hoisted =
@@ -496,7 +560,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
}
}
omp::TerminatorOp::create(singleBuilder, loc);
- return {allParallelized, copyPrivate};
+ return {allParallelized, copyPrivate, boxDataCopyVars};
};
for (Block &block : sourceRegion) {
@@ -557,7 +621,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
Block *parallelBlock = new Block();
parallelBuilder.setInsertionPointToStart(parallelBlock);
- auto [allParallelized, copyprivateVars] =
+ auto [allParallelized, copyprivateVars, boxDataCopyVars] =
moveToSingle(std::get<SingleRegion>(opOrSingle), allocaBuilder,
singleBuilder, parallelBuilder);
if (allParallelized) {
@@ -574,7 +638,10 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
cleanupBlock(singleBlock);
for (auto var : singleOperands.copyprivateVars) {
mlir::func::FuncOp funcOp =
- createCopyFunc(loc, var.getType(), firCopyFuncBuilder);
+ boxDataCopyVars.contains(var)
+ ? createBoxDataCopyFunc(loc, var.getType(),
+ firCopyFuncBuilder)
+ : createCopyFunc(loc, var.getType(), firCopyFuncBuilder);
singleOperands.copyprivateSyms.push_back(
SymbolRefAttr::get(funcOp));
}
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
index eb266a450e55d..b8c9dc13a1e90 100644
--- a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -9,7 +9,6 @@
!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix HLFIR
!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix FIR
!RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix LLVM
-!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | fir-opt --lower-workshare --allow-unregistered-dialect -o - | FileCheck %s --check-prefix FIROPT
! Test that parallel workshare with firstprivate(P) where P is a pointer
! correctly places stores through the pointer target in omp.single rather
@@ -87,80 +86,6 @@ subroutine test_workshare_firstprivate_pointer(P)
! FIR: }
! FIR: return
-! FIROPT: func.func @_QPtest_workshare_firstprivate_pointer(
-! FIROPT-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-
-! FIROPT: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
-! FIROPT: %[[I_ALLOC:.*]] = fir.alloca i32
-! FIROPT: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_ALLOC]]
-
-! FIROPT: %[[P_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[SCOPE]] arg 1
-! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
-
-! FIROPT: omp.parallel {
-
-! FIROPT: %[[P_PRIV:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "p", pinned
-
-! FIROPT: omp.single copyprivate(%[[P_PRIV]] -> @_workshare_copy_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) nowait {
-
-! FIROPT: %[[ZERO_PTR:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
-! FIROPT: %[[C0:.*]] = arith.constant 0 : index
-! FIROPT: %[[SHAPE:.*]] = fir.shape %[[C0]] : (index) -> !fir.shape<1>
-! FIROPT: %[[EMBOX:.*]] = fir.embox %[[ZERO_PTR]](%[[SHAPE]])
-! FIROPT: fir.store %[[EMBOX]] to %[[P_PRIV]]
-
-! FIROPT: %[[P_FP:.*]]:2 = hlfir.declare %[[P_PRIV]]
-! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
-
-! FIROPT: %[[LOAD_ORIG:.*]] = fir.load %[[P_DECL]]#0
-! FIROPT: fir.store %[[LOAD_ORIG]] to %[[P_FP]]#0
-
-! FIROPT: %[[C1:.*]] = arith.constant 1 : i32
-
-! FIROPT: %[[LOAD_PRIV:.*]] = fir.load %[[P_FP]]#0
-! FIROPT: %[[C0_2:.*]] = arith.constant 0 : index
-! FIROPT: %[[DIMS:.*]]:3 = fir.box_dims %[[LOAD_PRIV]], %[[C0_2]]
-
-! FIROPT: %[[EXT64:.*]] = fir.convert %[[DIMS]]#1 : (index) -> i64
-! FIROPT: %[[EXT32:.*]] = fir.convert %[[EXT64]] : (i64) -> i32
-
-! FIROPT: hlfir.forall lb {
-! FIROPT: hlfir.yield %[[C1]] : i32
-! FIROPT: } ub {
-! FIROPT: hlfir.yield %[[EXT32]] : i32
-! FIROPT: } (%[[IV:.*]]: i32) {
-
-! FIROPT: %[[IDX:.*]] = hlfir.forall_index "i" %[[IV]] : (i32) -> !fir.ref<i32>
-
-! FIROPT: hlfir.region_assign {
-
-! FIROPT: %[[IDX_VAL:.*]] = fir.load %[[IDX]] : !fir.ref<i32>
-! FIROPT: hlfir.yield %[[IDX_VAL]] : i32
-
-! FIROPT: } to {
-
-! FIROPT: %[[LOAD_BOX:.*]] = fir.load %[[P_FP]]#0
-! FIROPT: %[[LOAD_I:.*]] = fir.load %[[IDX]] : !fir.ref<i32>
-! FIROPT: %[[IDX64:.*]] = fir.convert %[[LOAD_I]] : (i32) -> i64
-
-! FIROPT: %[[DESIG:.*]] = hlfir.designate %[[LOAD_BOX]] (%[[IDX64]])
-! FIROPT-SAME: -> !fir.ref<i32>
-
-! FIROPT: hlfir.yield %[[DESIG]] : !fir.ref<i32>
-
-! FIROPT: }
-! FIROPT: }
-
-! FIROPT: omp.terminator
-! FIROPT: }
-
-! FIROPT: %[[POST_DECL:.*]]:2 = hlfir.declare %[[P_PRIV]]
-! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
-
-! FIROPT: omp.barrier
-! FIROPT: omp.terminator
-! FIROPT: }
-
! At LLVM IR level, verify the OpenMP fork call exists and the loop body
! is inside the outlined function.
! LLVM: call void {{.*}}__kmpc_fork_call({{.*}}@test_workshare_firstprivate_pointer_..omp_par{{.*}})
@@ -195,31 +120,29 @@ subroutine test_workshare_firstprivate_array(a, z, n)
end subroutine
! After workshare lowering, the dynamic alloca for the firstprivate copy
-! must be inside omp.single, with its address broadcast via a !fir.box
-! indirection alloca + copyprivate.
+! is hoisted so each thread gets its own allocation (true firstprivate).
+! The array data is broadcast via copyprivate using a box with a
+! data-copying function (_FortranAAssign).
! FIR-LABEL: {{.*}}test_workshare_firstprivate_array(
! FIR: %[[C1:.*]] = arith.constant 1 : i32
! FIR-LABEL: omp.parallel {
-! The box indirection alloca is hoisted for copyprivate
-! FIR: omp.single copyprivate(%[[BOX_INDIRECT:.*]] -> @_workshare_copy_box_Uxi32{{.*}}) {
-
-! The dynamic alloca (firstprivate copy) is inside the single block
-! FIR: %[[FP_ARRAY:.*]] = fir.alloca{{.*}}
+! The dynamic alloca is hoisted (per-thread allocation)
+! FIR: %[[FP_ARRAY:.*]] = fir.alloca !fir.array<?xi32>
+! The box slot and embox are hoisted for copyprivate
+! FIR: %[[BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+! FIR: %[[SHAPE:.*]] = fir.shape %{{.*}}
+! FIR: %[[BOX_VAL:.*]] = fir.embox %[[FP_ARRAY]](%[[SHAPE]]){{.*}}
+! FIR: fir.store %[[BOX_VAL]] to %[[BOX_SLOT]]
-! Runtime shape construction for the firstprivate array.
-! FIR: %[[SHAPE:.*]] = fir.shape %{{.*}}
-! FIR: %[[BOX_VAL:.*]] = fir.embox %[[FP_ARRAY]](%[[SHAPE]]){{.*}}fir.array<?xi32>{{.*}}
-! FIR: fir.store %[[BOX_VAL]] to %[[BOX_INDIRECT]] {{.*}}fir.array<?xi32>{{.*}}
+! Copyprivate uses box-data copy function to broadcast array contents
+! FIR: omp.single copyprivate(%[[BOX_SLOT]] -> @_workshare_copy_data_box_Uxi32{{.*}}) {
-! The initialization of the firstprivate copy
+! The initialization of the firstprivate copy (single thread only)
! FIR: fir.call @_FortranAAssign
! FIR: omp.terminator
! FIR: }
-! After single, the box is loaded and the address extracted
-! FIR: %[[LOADED_BOX:.*]] = fir.load %[[BOX_INDIRECT]]{{.*}}fir.array<?xi32>{{.*}}
-! FIR: %[[ARRAY_ADDR:.*]] = fir.box_addr %[[LOADED_BOX]]{{.*}}fir.array<?xi32>>{{.*}}
-! The workshared loop uses the broadcast address
+! The workshared loop uses the per-thread allocation directly
! FIR: omp.wsloop {
! FIR: %[[SRC_ELEM:.*]] = fir.array_coor %{{.*}}(%{{.*}}) %{{.*}}
! FIR: %[[SRC_VAL:.*]] = fir.load %[[SRC_ELEM]]
@@ -230,3 +153,264 @@ subroutine test_workshare_firstprivate_array(a, z, n)
! FIR: omp.barrier
! FIR: omp.terminator
! FIR: return
+
+! Test for "parallel workshare firstprivate(p)" where p is an allocatable
+! array that is allocated and assigned before the construct; the workshare
+! body reads p to compute a.
+subroutine allocatable_example()
+ implicit none
+
+ integer, allocatable :: p(:)
+ integer :: a(4)
+
+ allocate(p(4))
+ p = [1, 2, 3, 4]
+
+ !$omp parallel workshare firstprivate(p)
+ a = p + 1
+ !$omp end parallel workshare
+end subroutine
+
+! HLFIR-LABEL: func.func @_QPallocatable_example() {
+! HLFIR: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {{.*}}uniq_name = "_QFallocatable_exampleEa"
+! HLFIR: %[[ORIG_P:.*]]:2 = hlfir.declare %{{.*}} {{.*}}fortran_attrs = #fir.var_attrs<allocatable>{{.*}}uniq_name = "_QFallocatable_exampleEp"
+
+! Initial allocation/assignment of original p
+! HLFIR: fir.allocmem !fir.array<?xi32>
+! HLFIR: fir.store %{{.*}} to %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: hlfir.assign %{{.*}} to %[[ORIG_P]]#0 realloc : {{.*}}, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+
+! HLFIR: omp.parallel {
+! HLFIR: omp.workshare {
+
+! Firstprivate allocatable descriptor
+! HLFIR: %[[FP_ALLOCA:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {{.*}}bindc_name = "p"{{.*}}
+
+! Allocate/init firstprivate copy depending on original allocation status
+! HLFIR: %[[ORIG_VAL0:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: %[[ORIG_ADDR0:.*]] = fir.box_addr %[[ORIG_VAL0]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR: fir.convert %[[ORIG_ADDR0]]
+! HLFIR: arith.cmpi ne
+! HLFIR: fir.if %{{.*}} {
+! HLFIR: %[[ORIG_VAL1:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: fir.box_dims %[[ORIG_VAL1]]
+! HLFIR: fir.allocmem !fir.array<?xi32>
+! HLFIR: fir.embox
+! HLFIR: fir.store %{{.*}} to %[[FP_ALLOCA]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: } else {
+! HLFIR: fir.zero_bits !fir.heap<!fir.array<?xi32>>
+! HLFIR: fir.embox
+! HLFIR: fir.store %{{.*}} to %[[FP_ALLOCA]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: }
+
+! Declare firstprivate p
+! HLFIR: %[[FP_DECL:.*]]:2 = hlfir.declare %[[FP_ALLOCA]] {{.*}}fortran_attrs = #fir.var_attrs<allocatable>{{.*}}uniq_name = "_QFallocatable_exampleEp"
+
+! Copy original p into firstprivate p
+! HLFIR: %[[FP_VAL0:.*]] = fir.load %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: %[[FP_ADDR0:.*]] = fir.box_addr %[[FP_VAL0]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR: fir.convert %[[FP_ADDR0]]
+! HLFIR: arith.cmpi ne
+! HLFIR: fir.if %{{.*}} {
+! HLFIR: %[[ORIG_VAL2:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: hlfir.assign %[[ORIG_VAL2]] to %[[FP_DECL]]#0 realloc : !fir.box<!fir.heap<!fir.array<?xi32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: }
+
+! Use firstprivate p in: a = p + 1
+! HLFIR: %[[FP_VAL1:.*]] = fir.load %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: %[[EXPR:.*]] = hlfir.elemental
+! HLFIR: hlfir.designate %[[FP_VAL1]]
+! HLFIR: fir.load
+! HLFIR: arith.addi
+! HLFIR: hlfir.yield_element
+! HLFIR: hlfir.assign %[[EXPR]] to %[[A]]#0 : !hlfir.expr<?xi32>, !fir.ref<!fir.array<4xi32>>
+! HLFIR: hlfir.destroy %[[EXPR]] : !hlfir.expr<?xi32>
+
+! Cleanup firstprivate p
+! HLFIR: %[[FP_VAL2:.*]] = fir.load %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: %[[FP_ADDR1:.*]] = fir.box_addr %[[FP_VAL2]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR: fir.convert %[[FP_ADDR1]]
+! HLFIR: arith.cmpi ne
+! HLFIR: fir.if %{{.*}} {
+! HLFIR: %[[FP_VAL3:.*]] = fir.load %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: %[[FP_ADDR2:.*]] = fir.box_addr %[[FP_VAL3]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR: fir.freemem %[[FP_ADDR2]] : !fir.heap<!fir.array<?xi32>>
+! HLFIR: fir.store %{{.*}} to %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: }
+! HLFIR: omp.terminator
+! HLFIR: }
+! HLFIR: omp.terminator
+! HLFIR: }
+
+! Final cleanup of original p
+! HLFIR: %[[ORIG_VAL3:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: %[[ORIG_ADDR1:.*]] = fir.box_addr %[[ORIG_VAL3]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR: fir.convert %[[ORIG_ADDR1]]
+! HLFIR: arith.cmpi ne
+! HLFIR: fir.if %{{.*}} {
+! HLFIR: %[[ORIG_VAL4:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: %[[ORIG_ADDR2:.*]] = fir.box_addr %[[ORIG_VAL4]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR: fir.freemem %[[ORIG_ADDR2]] : !fir.heap<!fir.array<?xi32>>
+! HLFIR: fir.store %{{.*}} to %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: }
+
+! HLFIR: return
+! HLFIR: }
+
+! FIR-LABEL: func.func @_QPallocatable_example()
+
+! FIR: %[[C1_I32:.*]] = arith.constant 1 : i32
+
+! Original allocatable p declaration/allocation
+! FIR: %[[A_DECL:.*]] = fir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFallocatable_exampleEa"}
+! FIR: %[[ORIG_P:.*]] = fir.declare %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatable_exampleEp"}
+! FIR: fir.allocmem !fir.array<?xi32>
+! FIR: fir.store %{{.*}} to %[[ORIG_P]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR: fir.call @_FortranAAssign
+
+! FIR: omp.parallel {
+
+! Allocas for copyprivate slots
+! FIR: %[[A_BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.array<4xi32>>
+! FIR: %[[FP_BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "p", pinned
+! FIR: %[[COPY_BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
+! FIR: %[[COPY_HEAP_SLOT:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>>
+
+! Copyprivate with three slots for broadcasting firstprivate data
+! FIR: omp.single copyprivate(%[[FP_BOX_SLOT]] -> @_workshare_copy_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, %[[COPY_BOX_SLOT]] -> @_workshare_copy_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, %[[COPY_HEAP_SLOT]] -> @_workshare_copy_heap_Uxi32 : !fir.ref<!fir.heap<!fir.array<?xi32>>>) {
+
+! Check original allocation status and allocate firstprivate copy
+! FIR: %[[ORIG_BOX0:.*]] = fir.load %[[ORIG_P]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR: fir.box_addr %[[ORIG_BOX0]]
+! FIR: arith.cmpi ne
+! FIR: fir.if %{{.*}} {
+! FIR: fir.load %[[ORIG_P]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR: fir.box_dims
+! FIR: fir.allocmem !fir.array<?xi32>
+! FIR: fir.store %{{.*}} to %[[FP_BOX_SLOT]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR: } else {
+! FIR: fir.zero_bits !fir.heap<!fir.array<?xi32>>
+! FIR: fir.store %{{.*}} to %[[FP_BOX_SLOT]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR: }
+
+! Declare firstprivate p and copy data from original
+! FIR: %[[FP_DECL:.*]] = fir.declare %[[FP_BOX_SLOT]] {fortran_attrs = #fir.var_attrs<allocatable>
+! FIR: fir.load %[[FP_DECL]]
+! FIR: fir.box_addr
+! FIR: arith.cmpi ne
+! FIR: fir.if %{{.*}} {
+! FIR: fir.call @_FortranAAssign
+! FIR: }
+
+! Store to copyprivate broadcast slots
+! FIR: fir.store %{{.*}} to %[[COPY_BOX_SLOT]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR: fir.store %{{.*}} to %[[COPY_HEAP_SLOT]] : !fir.ref<!fir.heap<!fir.array<?xi32>>>
+! FIR: omp.terminator
+! FIR: }
+
+! After single: use copyprivate broadcast data for workshared computation
+! FIR: %[[FP_DECL2:.*]] = fir.declare %[[FP_BOX_SLOT]] {fortran_attrs = #fir.var_attrs<allocatable>
+! FIR: %[[COPY_BOX:.*]] = fir.load %[[COPY_BOX_SLOT]]
+
+! Workshared loop: temp = p + 1
+! FIR: omp.wsloop {
+! FIR: omp.loop_nest (%[[I:.*]]) : index
+! FIR: %[[SRC_VAL:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! FIR: %[[ADD_RES:.*]] = arith.addi %[[SRC_VAL]], %[[C1_I32]] : i32
+! FIR: fir.store %[[ADD_RES]] to %{{.*}} : !fir.ref<i32>
+! FIR: omp.yield
+! FIR: }
+
+! Assignment of temp to a and cleanup (in omp.single nowait)
+! FIR: omp.single nowait {
+! FIR: fir.call @_FortranAAssign
+! FIR: fir.freemem
+! Cleanup firstprivate p
+! FIR: fir.if %{{.*}} {
+! FIR: fir.freemem
+! FIR: }
+! FIR: omp.terminator
+! FIR: }
+
+! FIR: omp.barrier
+! FIR: omp.terminator
+! FIR: }
+
+! Cleanup original p
+! FIR: fir.if %{{.*}} {
+! FIR: fir.freemem
+! FIR: }
+! FIR: return
+
+! Test for "parallel workshare firstprivate(p)" where p is a fixed-size
+! array of derived type t; the workshare body reads component x of every
+! element of p to compute a.
+subroutine derived_type_example()
+ implicit none
+
+ type :: t
+ integer :: x
+ end type
+
+ type(t) :: p(4)
+ integer :: a(4)
+
+ p%x = [1, 2, 3, 4]
+
+ !$omp parallel workshare firstprivate(p)
+ a = p%x + 1
+ !$omp end parallel workshare
+end subroutine
+
+! FIR-LABEL: func.func @_QPderived_type_example()
+! FIR: %[[C1_I32:.*]] = arith.constant 1 : i32
+! FIR: %[[A_DECL:.*]] = fir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFderived_type_exampleEa"}
+! FIR: %[[ORIG_P:.*]] = fir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFderived_type_exampleEp"}
+! FIR: fir.call @_FortranAAssign
+! FIR: omp.parallel {
+
+! Allocas for copyprivate slots
+! FIR: %[[A_BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.array<4xi32>>
+! FIR: %[[P_BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.array<4x!fir.type<{{.*}}>>>
+! FIR: %[[FP_ARRAY:.*]] = fir.alloca !fir.array<4x!fir.type<{{.*}}>> {bindc_name = "p", pinned
+! FIR: %[[HEAP_SLOT:.*]] = fir.alloca !fir.heap<!fir.array<4xi32>>
+
+! Copyprivate with derived-type copy function and heap copy function
+! FIR: omp.single copyprivate(%[[FP_ARRAY]] -> @_workshare_copy_4xrec__QFderived_type_exampleTt : {{.*}}, %[[HEAP_SLOT]] -> @_workshare_copy_heap_4xi32 : {{.*}}) {
+
+! Declare firstprivate p and copy original data
+! FIR: fir.declare %[[FP_ARRAY]]
+! FIR: fir.call @_FortranAAssign
+
+! Allocate temp array for expression result
+! FIR: fir.allocmem !fir.array<4xi32>
+! FIR: fir.store %{{.*}} to %[[HEAP_SLOT]]
+! FIR: omp.terminator
+! FIR: }
+
+! After single: declare firstprivate p and extract p%x via slice
+! FIR: %[[FP_DECL:.*]] = fir.declare %[[FP_ARRAY]]
+! FIR: fir.field_index x, !fir.type<{{.*}}>
+! FIR: fir.slice
+! FIR: %[[PX_ADDR:.*]] = fir.box_addr
+
+! Load temp array from copyprivate slot
+! FIR: %[[HEAP_VAL:.*]] = fir.load %[[HEAP_SLOT]]
+! FIR: %[[TMP_DECL:.*]] = fir.declare %[[HEAP_VAL]]
+
+! Workshared loop: temp = p%x + 1
+! FIR: omp.wsloop {
+! FIR: omp.loop_nest (%[[I:.*]]) : index
+! FIR: %[[SRC_ELEM:.*]] = fir.array_coor %[[PX_ADDR]]
+! FIR: %[[SRC_VAL:.*]] = fir.load %[[SRC_ELEM]] : !fir.ref<i32>
+! FIR: %[[ADD_RES:.*]] = arith.addi %[[SRC_VAL]], %[[C1_I32]] : i32
+! FIR: %[[DST_ELEM:.*]] = fir.array_coor %[[TMP_DECL]]
+! FIR: fir.store %[[ADD_RES]] to %[[DST_ELEM]] : !fir.ref<i32>
+! FIR: omp.yield
+
+! Assignment of temp to a and cleanup (in omp.single nowait)
+! FIR: omp.single nowait {
+! FIR: fir.call @_FortranAAssign
+! FIR: fir.freemem
+! FIR: omp.terminator
+! FIR: }
+
+! FIR: omp.barrier
+! FIR: omp.terminator
+! FIR: }
+
+! FIR: return
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index e8cb9065123e7..124077a9b92ac 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -488,5 +488,76 @@ func.func @pointer_descriptor_store_is_thread_local() {
// Test for "parallel workshare firstprivate(z)" where z is an array.
// Check that z is broadcast to all private values of the threads.
-// This test is now part of flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
+// The copy function uses _FortranAAssign to broadcast array data.
+// CHECK-LABEL: func.func private @_workshare_copy_data_box_Uxi32(
+// CHECK: fir.call @_FortranAAssign
+// CHECK: return
+
+// CHECK-LABEL: func.func @dynamic_alloca_firstprivate_array(
+// CHECK-SAME: %[[N:.*]]: index,
+// CHECK-SAME: %[[SRC:.*]]: !fir.ref<!fir.array<?xi32>>,
+// CHECK-SAME: %[[DST:.*]]: !fir.ref<!fir.array<?xi32>>)
+func.func @dynamic_alloca_firstprivate_array(%n: index, %src: !fir.ref<!fir.array<?xi32>>, %dst: !fir.ref<!fir.array<?xi32>>) {
+ omp.parallel {
+ omp.workshare {
+ // Runtime-sized alloca (%n elements) that holds the firstprivate copy of z
+ %z = fir.alloca !fir.array<?xi32>, %n {bindc_name = "z", pinned}
+ %shape = fir.shape %n : (index) -> !fir.shape<1>
+ %decl = fir.declare %z(%shape) {uniq_name = "z"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
+ // Opaque side-effecting op (operands: %decl, %src) standing in for the initialization of the firstprivate copy
+ "test.init"(%decl, %src) : (!fir.ref<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
+ // Workshared loop: loads each element of the firstprivate array and stores it to %dst
+ %c1 = arith.constant 1 : index
+ omp.workshare.loop_wrapper {
+ omp.loop_nest (%i) : index = (%c1) to (%n) inclusive step (%c1) {
+ %elem = fir.array_coor %decl(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+ %val = fir.load %elem : !fir.ref<i32>
+ %dst_elem = fir.array_coor %dst(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+ fir.store %val to %dst_elem : !fir.ref<i32>
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ return
+}
+
+// CHECK: omp.parallel {
+
+// The dynamic alloca is hoisted so each thread gets its own allocation.
+// CHECK: %[[ARRAY:.*]] = fir.alloca !fir.array<?xi32>, %[[N]]
+// CHECK-SAME: bindc_name = "z"
+// CHECK-SAME: pinned
+
+// A box slot is used for copyprivate to broadcast the array data.
+// CHECK: %[[BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+// CHECK: %[[SHAPE0:.*]] = fir.shape %[[N]]
+// CHECK: %[[BOX:.*]] = fir.embox %[[ARRAY]](%[[SHAPE0]]) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK: fir.store %[[BOX]] to %[[BOX_SLOT]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+
+// Initialization happens only inside the single block.
+// CHECK: omp.single copyprivate(%[[BOX_SLOT]] -> @_workshare_copy_data_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>>) {
+// CHECK: %[[SHAPE1:.*]] = fir.shape %[[N]]
+// CHECK: %[[DECL1:.*]] = fir.declare %[[ARRAY]](%[[SHAPE1]])
+// CHECK: "test.init"(%[[DECL1]], %[[SRC]])
+// CHECK: omp.terminator
+// CHECK: }
+
+// The workshared loop uses the hoisted per-thread array.
+// CHECK: %[[SHAPE2:.*]] = fir.shape %[[N]]
+// CHECK: %[[DECL2:.*]] = fir.declare %[[ARRAY]](%[[SHAPE2]])
+// CHECK: %[[C1:.*]] = arith.constant 1 : index
+// CHECK: omp.wsloop nowait
+// CHECK: omp.loop_nest (%[[I:.*]]) : index = (%[[C1]]) to (%[[N]]) inclusive step (%[[C1]])
+// CHECK: %[[ELEM:.*]] = fir.array_coor %[[DECL2]](%[[SHAPE2]]) %[[I]] : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+// CHECK: %[[VAL:.*]] = fir.load %[[ELEM]] : !fir.ref<i32>
+// CHECK: %[[DST_ELEM:.*]] = fir.array_coor %[[DST]](%[[SHAPE2]]) %[[I]] : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+// CHECK: fir.store %[[VAL]] to %[[DST_ELEM]] : !fir.ref<i32>
+// CHECK: omp.yield
+
+// CHECK: omp.barrier
+// CHECK: omp.terminator
+// CHECK: return
>From f72cc63c615cdbfcd46838625dadc0033556f332 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Wed, 3 Jun 2026 06:07:15 -0500
Subject: [PATCH 6/6] Use fir::runtime::genAssign instead of an explicit call
to _FortranAAssign. This helps with inlining in builds at higher optimization
levels (-O1, -O2, etc.).
---
flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 31 +++----------------
1 file changed, 5 insertions(+), 26 deletions(-)
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index 50463810d5025..7c8a3059329bb 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -18,6 +18,7 @@
#include <flang/Optimizer/Analysis/AliasAnalysis.h>
#include <flang/Optimizer/Builder/FIRBuilder.h>
+#include <flang/Optimizer/Builder/Runtime/Assign.h>
#include <flang/Optimizer/Dialect/FIROps.h>
#include <flang/Optimizer/Dialect/FIRType.h>
#include <flang/Optimizer/HLFIR/HLFIROps.h>
@@ -318,23 +319,6 @@ static mlir::func::FuncOp createBoxDataCopyFunc(mlir::Location loc,
if (auto decl = module.lookupSymbol<mlir::func::FuncOp>(copyFuncName))
return decl;
- // Ensure _FortranAAssign is declared in the module.
- auto boxNoneTy = fir::BoxType::get(builder.getNoneType());
- auto refBoxNoneTy = fir::ReferenceType::get(boxNoneTy);
- auto refI8Ty = fir::ReferenceType::get(builder.getIntegerType(8));
- auto i32Ty = builder.getI32Type();
- llvm::StringRef assignFuncName = "_FortranAAssign";
- auto assignFunc = module.lookupSymbol<mlir::func::FuncOp>(assignFuncName);
- if (!assignFunc) {
- mlir::OpBuilder::InsertionGuard g(builder);
- mlir::OpBuilder modBuilder(module.getBodyRegion());
- auto assignFuncType = mlir::FunctionType::get(
- builder.getContext(), {refBoxNoneTy, boxNoneTy, refI8Ty, i32Ty}, {});
- assignFunc = mlir::func::FuncOp::create(modBuilder, loc, assignFuncName,
- assignFuncType);
- assignFunc.setVisibility(mlir::SymbolTable::Visibility::Private);
- }
-
// Create the copy function.
mlir::OpBuilder::InsertionGuard guard(builder);
mlir::OpBuilder modBuilder(module.getBodyRegion());
@@ -348,22 +332,17 @@ static mlir::func::FuncOp createBoxDataCopyFunc(mlir::Location loc,
{loc, loc});
builder.setInsertionPointToStart(&funcOp.getRegion().back());
- // Load the source box.
+ // Load the source box and use the runtime helper to generate the assign.
Value srcBox =
fir::LoadOp::create(builder, loc, eleTy, funcOp.getArgument(1));
- // Convert types for _FortranAAssign call.
+ auto boxNoneTy = fir::BoxType::get(builder.getNoneType());
+ auto refBoxNoneTy = fir::ReferenceType::get(boxNoneTy);
Value dstConv =
fir::ConvertOp::create(builder, loc, refBoxNoneTy, funcOp.getArgument(0));
Value srcConv = fir::ConvertOp::create(builder, loc, boxNoneTy, srcBox);
- // Use null source location (only used for error reporting).
- Value nullLoc = fir::ZeroOp::create(builder, loc, refI8Ty);
- Value zeroLine = builder.createIntegerConstant(loc, i32Ty, 0);
-
- // Call _FortranAAssign to copy the array data.
- fir::CallOp::create(builder, loc, assignFunc,
- mlir::ValueRange{dstConv, srcConv, nullLoc, zeroLine});
+ fir::runtime::genAssign(builder, loc, dstConv, srcConv);
mlir::func::ReturnOp::create(builder, loc);
return funcOp;
More information about the flang-commits
mailing list