[flang-commits] [flang] [flang][OpenMP] Fix incorrect handling for local variable in OpenMP parallel workshare firstprivate(P) (PR #195616)

via flang-commits flang-commits at lists.llvm.org
Mon May 4 01:59:46 PDT 2026


llvmorg-github-actions[bot] wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-flang-openmp

Author: SunilKuravinakop

<details>
<summary>Changes</summary>

Changes to handle "!$omp parallel workshare firstprivate(P)" where P is an array. Handling the creation and initialization of the local copy properly.

This handles issue [195337](https://github.com/llvm/llvm-project/issues/195337) .

---
Full diff: https://github.com/llvm/llvm-project/pull/195616.diff


3 Files Affected:

- (modified) flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp (+31-1) 
- (added) flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 (+64) 
- (modified) flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir (+80) 


``````````diff
diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index a41d8d8826501..cf51cb887622f 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -156,6 +156,11 @@ static bool isOpenMPThreadLocalMemory(Operation *op, Value mem) {
   fir::AliasAnalysis aliasAnalysis;
   fir::AliasAnalysis::Source source = aliasAnalysis.getSource(mem);
 
+  // With firstprivate(P) where P is a pointer, each thread gets its own copy
+  // of the descriptor, but P(i) accesses shared target data.
+  if (source.accessPath.hasPointerDeref())
+    return false;
+
   // Check if the source is a Value (not a global symbol).
   mlir::Value sourceValue =
       llvm::dyn_cast_if_present<mlir::Value>(source.origin.u);
@@ -370,10 +375,34 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
     SmallVector<Value> copyPrivate;
     bool allParallelized = true;
 
+    // "firstprivate" pointer initialization creates: (1) alloca, (2) store
+    // null box, (3) copy original. If step (2) is duplicated into the
+    // parallel block, it runs after initialization of the private copy and
+    // overwrites the pointer descriptor with null, causing a segfault on
+    // dereference.
+    SmallPtrSet<Value, 4> hoistedCopyprivateAllocas;
+
     for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
       if (isSafeToParallelize(&op)) {
         singleBuilder.clone(op, singleMapping);
-        if (llvm::all_of(op.getOperands(), [&](Value opr) {
+        // Check if this operation writes to a hoisted copyprivate alloca.
+        // Such stores must stay only in the single block; the copyprivate
+        // mechanism handles broadcasting the final value to all threads.
+        bool writesToCopyprivateAlloca = false;
+        if (!hoistedCopyprivateAllocas.empty()) {
+          if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(&op)) {
+            SmallVector<MemoryEffects::EffectInstance> effects;
+            memEffects.getEffects(effects);
+            writesToCopyprivateAlloca =
+                llvm::any_of(effects, [&](const auto &eff) {
+                  return isa<MemoryEffects::Write>(eff.getEffect()) &&
+                         eff.getValue() &&
+                         hoistedCopyprivateAllocas.contains(eff.getValue());
+                });
+          }
+        }
+        if (!writesToCopyprivateAlloca &&
+            llvm::all_of(op.getOperands(), [&](Value opr) {
               // Either we have already remapped it
               bool remapped = rootMapping.contains(opr);
               // Or it is available because it dominates `sr`
@@ -399,6 +428,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
         rootMapping.map(&*alloca, &*hoisted);
         rootMapping.map(alloca.getResult(), hoisted.getResult());
         copyPrivate.push_back(hoisted);
+        hoistedCopyprivateAllocas.insert(alloca.getResult());
         allParallelized = false;
       } else {
         singleBuilder.clone(op, singleMapping);
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
new file mode 100644
index 0000000000000..5e08c2dd161ec
--- /dev/null
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -0,0 +1,64 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+! added to this directory and sub-directories.
+!===----------------------------------------------------------------------===!
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix HLFIR
+!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix FIR
+!RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix LLVM
+
+! Test that parallel workshare with firstprivate(P) where P is a pointer
+! correctly places stores through the pointer target in omp.single rather
+! than parallelizing them. The pointer descriptor is thread-local (firstprivate),
+! but the target data is shared memory.
+
+subroutine test_workshare_firstprivate_pointer(P)
+  integer, pointer, intent(in) :: P(:)
+  integer :: i
+  !$omp parallel workshare firstprivate(P)
+  forall (i = 1:SIZE(P)) P(i) = i
+  !$omp end parallel workshare
+end subroutine
+
+! HLFIR:     omp.parallel {
+! HLFIR:       omp.workshare {
+! The firstprivate copy: alloca, zero-init, declare, then copy from original
+! HLFIR:         fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! HLFIR:         fir.store
+! HLFIR:         hlfir.declare
+! HLFIR:         fir.load
+! HLFIR:         fir.store
+! HLFIR:         hlfir.forall
+! HLFIR:         omp.terminator
+! HLFIR:       }
+! HLFIR:       omp.terminator
+! HLFIR:     }
+
+! After workshare lowering, the forall body (which stores through the pointer
+! target) must be inside omp.single, not parallelized.
+! FIR:     omp.parallel {
+! FIR:       %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! The firstprivate init + copy and the forall loop must be in omp.single
+! FIR:       omp.single copyprivate(%[[DESC]]
+! FIR:         fir.store
+! FIR:         fir.declare
+! FIR:         fir.load
+! FIR:         fir.store
+! The forall loop accesses pointer target (shared memory) - must stay in single
+! FIR:         fir.do_loop
+! FIR:           fir.array_coor
+! FIR:           fir.store
+! FIR:         omp.terminator
+! FIR:       }
+! FIR:       omp.barrier
+! FIR:       omp.terminator
+
+! At LLVM IR level, verify the OpenMP fork call exists and the loop body
+! is inside the outlined function.
+! LLVM:     call void {{.*}}__kmpc_fork_call
+! LLVM:     define internal void @test_workshare_firstprivate_pointer_..omp_par
+! The single construct must be present in the outlined function
+! LLVM:       call i32 @__kmpc_single
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index d6000c989515b..12bae176b70d2 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -403,3 +403,83 @@ func.func @forall_pattern_in_workshare(%shared: !fir.ref<i32>) {
 // CHECK:         }
 // CHECK:         omp.barrier
 // CHECK:       }
+
+
+// Check that a store through a pointer dereference is NOT considered
+// thread-local, even if the pointer descriptor itself is in a thread-local
+// alloca. This models the "parallel workshare firstprivate(P)" case where P
+// is a Fortran POINTER: each thread gets its own copy of the descriptor, but
+// P(i) accesses shared target data through the pointer.
+
+// CHECK-LABEL: func.func @pointer_deref_not_thread_local
+func.func @pointer_deref_not_thread_local() {
+  omp.parallel {
+    // Thread-local alloca for the pointer descriptor (models firstprivate)
+    %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+    %decl = fir.declare %desc {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "p"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+    omp.workshare {
+      // Load the pointer box and access the target data via array_coor.
+      // Even though %desc is thread-local, the target data is shared.
+      %box = fir.load %decl : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+      %c0 = arith.constant 0 : index
+      %dims:3 = fir.box_dims %box, %c0 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+      %shift = fir.shift %dims#0 : (index) -> !fir.shift<1>
+      %c1_i64 = arith.constant 1 : i64
+      %elem = fir.array_coor %box(%shift) %c1_i64 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+      %c42 = arith.constant 42 : i32
+      // This store goes to shared target data (through pointer deref),
+      // so it MUST be in omp.single, not parallelized.
+      fir.store %c42 to %elem : !fir.ref<i32>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// The store through the pointer dereference must be inside omp.single.
+// CHECK:       omp.parallel {
+// CHECK:         fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK:         omp.single
+// CHECK:           fir.load
+// CHECK:           fir.box_dims
+// CHECK:           fir.array_coor
+// CHECK:           fir.store
+// CHECK:           omp.terminator
+// CHECK-NEXT:    }
+// CHECK:         omp.barrier
+// CHECK:       }
+
+
+// Check that a direct store to the pointer descriptor alloca (not through
+// the pointer target) IS still recognized as thread-local.
+
+// CHECK-LABEL: func.func @pointer_descriptor_store_is_thread_local
+func.func @pointer_descriptor_store_is_thread_local() {
+  omp.parallel {
+    %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+    omp.workshare {
+      %null = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+      %c0 = arith.constant 0 : index
+      %shape = fir.shape %c0 : (index) -> !fir.shape<1>
+      %box = fir.embox %null(%shape) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+      // This store updates the descriptor itself (thread-local alloca),
+      // NOT the pointer target, so it should be parallelized.
+      fir.store %box to %desc : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// The store to the descriptor alloca is thread-local and should NOT be in omp.single.
+// CHECK:       omp.parallel {
+// CHECK-NEXT:    %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK:         fir.zero_bits
+// CHECK:         fir.shape
+// CHECK:         fir.embox
+// CHECK:         fir.store {{.*}} to %[[DESC]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK-NEXT:    omp.barrier
+// CHECK-NEXT:    omp.terminator
+// CHECK-NEXT:  }

``````````

</details>


https://github.com/llvm/llvm-project/pull/195616


More information about the flang-commits mailing list