[flang-commits] [flang] [flang][OpenMP] Fix incorrect handling for local variable in OpenMP parallel workshare firstprivate(P) (PR #195616)

Tue May 5 01:58:17 PDT 2026

https://github.com/SunilKuravinakop updated https://github.com/llvm/llvm-project/pull/195616

>From dfb65ee076ec5787675df968edc03cc0021b1892 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Mon, 4 May 2026 03:39:17 -0500
Subject: [PATCH 1/3] Changes to handle "!$omp parallel workshare
 firstprivate(P)" where P is an array. Handling the creation and
 initialization of the local copy properly.

---
 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 32 +++++++-
 .../OpenMP/workshare-firstprivate-pointer.f90 | 64 +++++++++++++++
 .../OpenMP/lower-workshare-thread-local.mlir  | 80 +++++++++++++++++++
 3 files changed, 175 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90

diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index a41d8d8826501..cf51cb887622f 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -156,6 +156,11 @@ static bool isOpenMPThreadLocalMemory(Operation *op, Value mem) {
   fir::AliasAnalysis aliasAnalysis;
   fir::AliasAnalysis::Source source = aliasAnalysis.getSource(mem);
 
+  // With firstprivate(P) where P is a pointer, each thread gets its own copy
+  // of the descriptor, but P(i) accesses shared target data.
+  if (source.accessPath.hasPointerDeref())
+    return false;
+
   // Check if the source is a Value (not a global symbol).
   mlir::Value sourceValue =
       llvm::dyn_cast_if_present<mlir::Value>(source.origin.u);
@@ -370,10 +375,34 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
     SmallVector<Value> copyPrivate;
     bool allParallelized = true;
 
+    // "firstprivate" pointer initialization creates: (1) alloca, (2) store
+    // null box, (3) copy original. If step (2) is duplicated into the
+    // parallel block, it runs after initialization of the private copy and
+    // overwrites the pointer descriptor with null, causing a segfault on
+    // dereference.
+    SmallPtrSet<Value, 4> hoistedCopyprivateAllocas;
+
     for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
       if (isSafeToParallelize(&op)) {
         singleBuilder.clone(op, singleMapping);
-        if (llvm::all_of(op.getOperands(), [&](Value opr) {
+        // Check if this operation writes to a hoisted copyprivate alloca.
+        // Such stores must stay only in the single block; the copyprivate
+        // mechanism handles broadcasting the final value to all threads.
+        bool writesToCopyprivateAlloca = false;
+        if (!hoistedCopyprivateAllocas.empty()) {
+          if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(&op)) {
+            SmallVector<MemoryEffects::EffectInstance> effects;
+            memEffects.getEffects(effects);
+            writesToCopyprivateAlloca =
+                llvm::any_of(effects, [&](const auto &eff) {
+                  return isa<MemoryEffects::Write>(eff.getEffect()) &&
+                         eff.getValue() &&
+                         hoistedCopyprivateAllocas.contains(eff.getValue());
+                });
+          }
+        }
+        if (!writesToCopyprivateAlloca &&
+            llvm::all_of(op.getOperands(), [&](Value opr) {
               // Either we have already remapped it
               bool remapped = rootMapping.contains(opr);
               // Or it is available because it dominates `sr`
@@ -399,6 +428,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
         rootMapping.map(&*alloca, &*hoisted);
         rootMapping.map(alloca.getResult(), hoisted.getResult());
         copyPrivate.push_back(hoisted);
+        hoistedCopyprivateAllocas.insert(alloca.getResult());
         allParallelized = false;
       } else {
         singleBuilder.clone(op, singleMapping);
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
new file mode 100644
index 0000000000000..5e08c2dd161ec
--- /dev/null
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -0,0 +1,64 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+! added to this directory and sub-directories.
+!===----------------------------------------------------------------------===!
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix HLFIR
+!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix FIR
+!RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix LLVM
+
+! Test that parallel workshare with firstprivate(P) where P is a pointer
+! correctly places stores through the pointer target in omp.single rather
+! than parallelizing them. The pointer descriptor is thread-local (firstprivate),
+! but the target data is shared memory.
+
+subroutine test_workshare_firstprivate_pointer(P)
+  integer, pointer, intent(in) :: P(:)
+  integer :: i
+  !$omp parallel workshare firstprivate(P)
+  forall (i = 1:SIZE(P)) P(i) = i
+  !$omp end parallel workshare
+end subroutine
+
+! HLFIR:     omp.parallel {
+! HLFIR:       omp.workshare {
+! The firstprivate copy: alloca, zero-init, declare, then copy from original
+! HLFIR:         fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! HLFIR:         fir.store
+! HLFIR:         hlfir.declare
+! HLFIR:         fir.load
+! HLFIR:         fir.store
+! HLFIR:         hlfir.forall
+! HLFIR:         omp.terminator
+! HLFIR:       }
+! HLFIR:       omp.terminator
+! HLFIR:     }
+
+! After workshare lowering, the forall body (which stores through the pointer
+! target) must be inside omp.single, not parallelized.
+! FIR:     omp.parallel {
+! FIR:       %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! The firstprivate init + copy and the forall loop must be in omp.single
+! FIR:       omp.single copyprivate(%[[DESC]]
+! FIR:         fir.store
+! FIR:         fir.declare
+! FIR:         fir.load
+! FIR:         fir.store
+! The forall loop accesses pointer target (shared memory) - must stay in single
+! FIR:         fir.do_loop
+! FIR:           fir.array_coor
+! FIR:           fir.store
+! FIR:         omp.terminator
+! FIR:       }
+! FIR:       omp.barrier
+! FIR:       omp.terminator
+
+! At LLVM IR level, verify the OpenMP fork call exists and the loop body
+! is inside the outlined function.
+! LLVM:     call void {{.*}}__kmpc_fork_call
+! LLVM:     define internal void @test_workshare_firstprivate_pointer_..omp_par
+! The single construct must be present in the outlined function
+! LLVM:       call i32 @__kmpc_single
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index d6000c989515b..12bae176b70d2 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -403,3 +403,83 @@ func.func @forall_pattern_in_workshare(%shared: !fir.ref<i32>) {
 // CHECK:         }
 // CHECK:         omp.barrier
 // CHECK:       }
+
+
+// Check that a store through a pointer dereference is NOT considered
+// thread-local, even if the pointer descriptor itself is in a thread-local
+// alloca. This models the "parallel workshare firstprivate(P)" case where P
+// is a Fortran POINTER: each thread gets its own copy of the descriptor, but
+// P(i) accesses shared target data through the pointer.
+
+// CHECK-LABEL: func.func @pointer_deref_not_thread_local
+func.func @pointer_deref_not_thread_local() {
+  omp.parallel {
+    // Thread-local alloca for the pointer descriptor (models firstprivate)
+    %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+    %decl = fir.declare %desc {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "p"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+    omp.workshare {
+      // Load the pointer box and access the target data via array_coor.
+      // Even though %desc is thread-local, the target data is shared.
+      %box = fir.load %decl : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+      %c0 = arith.constant 0 : index
+      %dims:3 = fir.box_dims %box, %c0 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+      %shift = fir.shift %dims#0 : (index) -> !fir.shift<1>
+      %c1_i64 = arith.constant 1 : i64
+      %elem = fir.array_coor %box(%shift) %c1_i64 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+      %c42 = arith.constant 42 : i32
+      // This store goes to shared target data (through pointer deref),
+      // so it MUST be in omp.single, not parallelized.
+      fir.store %c42 to %elem : !fir.ref<i32>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// The store through the pointer dereference must be inside omp.single.
+// CHECK:       omp.parallel {
+// CHECK:         fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK:         omp.single
+// CHECK:           fir.load
+// CHECK:           fir.box_dims
+// CHECK:           fir.array_coor
+// CHECK:           fir.store
+// CHECK:           omp.terminator
+// CHECK-NEXT:    }
+// CHECK:         omp.barrier
+// CHECK:       }
+
+
+// Check that a direct store to the pointer descriptor alloca (not through
+// the pointer target) IS still recognized as thread-local.
+
+// CHECK-LABEL: func.func @pointer_descriptor_store_is_thread_local
+func.func @pointer_descriptor_store_is_thread_local() {
+  omp.parallel {
+    %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+    omp.workshare {
+      %null = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+      %c0 = arith.constant 0 : index
+      %shape = fir.shape %c0 : (index) -> !fir.shape<1>
+      %box = fir.embox %null(%shape) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+      // This store updates the descriptor itself (thread-local alloca),
+      // NOT the pointer target, so it should be parallelized.
+      fir.store %box to %desc : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// The store to the descriptor alloca is thread-local and should NOT be in omp.single.
+// CHECK:       omp.parallel {
+// CHECK-NEXT:    %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK:         fir.zero_bits
+// CHECK:         fir.shape
+// CHECK:         fir.embox
+// CHECK:         fir.store {{.*}} to %[[DESC]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK-NEXT:    omp.barrier
+// CHECK-NEXT:    omp.terminator
+// CHECK-NEXT:  }

>From 7472b41f54f62b4314b1a2508a270c80bbebe6c1 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Tue, 5 May 2026 00:31:42 -0500
Subject: [PATCH 2/3] Taking care of referenced type values.

---
 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 49 +++++++++++++++----
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index cf51cb887622f..fe7064b7cda0b 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -361,9 +361,21 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
     if (auto reloaded = rootMapping.lookupOrNull(v))
       return nullptr;
     Type ty = v.getType();
-    Value alloc = fir::AllocaOp::create(allocaBuilder, loc, ty);
-    fir::StoreOp::create(singleBuilder, loc, singleMapping.lookup(v), alloc);
-    Value reloaded = fir::LoadOp::create(parallelBuilder, loc, ty, alloc);
+    // fir.alloca cannot wrap fir.ref, so for reference-typed values
+    // (e.g. results of dynamic fir.alloca ops) use fir.heap as the
+    // intermediary pointer type for the broadcast alloca.
+    Type allocTy = ty;
+    if (auto rt = mlir::dyn_cast<fir::ReferenceType>(ty))
+      allocTy = fir::HeapType::get(rt.getEleTy());
+    Value alloc = fir::AllocaOp::create(allocaBuilder, loc, allocTy);
+    Value singleVal = singleMapping.lookup(v);
+    if (allocTy != ty)
+      singleVal =
+          fir::ConvertOp::create(singleBuilder, loc, allocTy, singleVal);
+    fir::StoreOp::create(singleBuilder, loc, singleVal, alloc);
+    Value reloaded = fir::LoadOp::create(parallelBuilder, loc, allocTy, alloc);
+    if (allocTy != ty)
+      reloaded = fir::ConvertOp::create(parallelBuilder, loc, ty, reloaded);
     rootMapping.map(v, reloaded);
     return alloc;
   };
@@ -423,13 +435,30 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
           allParallelized = false;
         }
       } else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
-        auto hoisted =
-            cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
-        rootMapping.map(&*alloca, &*hoisted);
-        rootMapping.map(alloca.getResult(), hoisted.getResult());
-        copyPrivate.push_back(hoisted);
-        hoistedCopyprivateAllocas.insert(alloca.getResult());
-        allParallelized = false;
+        if (alloca.isDynamic()) {
+          // Dynamic allocas (e.g. firstprivate arrays with runtime extent)
+          // cannot use the simple load/store copyprivate copy function
+          // because it only copies a single element for sequence types like
+          // !fir.array<?xi32>. Instead, keep the alloca in the single block
+          // and broadcast only its pointer to all threads.
+          singleBuilder.clone(op, singleMapping);
+          if (isTransitivelyUsedOutside(alloca.getResult(), sr)) {
+            auto alloc =
+                mapReloadedValue(alloca.getResult(), allocaBuilder,
+                                 singleBuilder, parallelBuilder, singleMapping);
+            if (alloc)
+              copyPrivate.push_back(alloc);
+          }
+          allParallelized = false;
+        } else {
+          auto hoisted =
+              cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
+          rootMapping.map(&*alloca, &*hoisted);
+          rootMapping.map(alloca.getResult(), hoisted.getResult());
+          copyPrivate.push_back(hoisted);
+          hoistedCopyprivateAllocas.insert(alloca.getResult());
+          allParallelized = false;
+        }
       } else {
         singleBuilder.clone(op, singleMapping);
         // Prepare reloaded values for results of operations that cannot be

>From ed4377c87cb14b338d503e0791d38303e7ac7285 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Tue, 5 May 2026 03:55:28 -0500
Subject: [PATCH 3/3] Changes in test cases to check for array. In "parallel
 workshare firstprivate(z)" z is an array.

---
 .../OpenMP/workshare-firstprivate-pointer.f90 | 37 +++++++++++++
 .../OpenMP/lower-workshare-thread-local.mlir  | 54 +++++++++++++++++++
 2 files changed, 91 insertions(+)

diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
index 5e08c2dd161ec..bac5e4dcdc6a0 100644
--- a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -62,3 +62,40 @@ subroutine test_workshare_firstprivate_pointer(P)
 ! LLVM:     define internal void @test_workshare_firstprivate_pointer_..omp_par
 ! The single construct must be present in the outlined function
 ! LLVM:       call i32 @__kmpc_single
+
+! Test for "workshare firstprivate(z)" where z is an array.
+! Check code to correctly broadcast the address of the firstprivate
+! copy to all threads, instead of using a broken load/store copyprivate
+! that only copies a single element for dynamically-sized arrays.
+
+subroutine test_workshare_firstprivate_array(a, z, n)
+  integer(4) :: n
+  integer(4), dimension(n) :: z, a
+  !$omp parallel workshare firstprivate(z)
+  a = z + 1
+  !$omp end parallel workshare
+end subroutine
+
+! After workshare lowering, the dynamic alloca for the firstprivate copy
+! must be inside omp.single, with its address broadcast via a !fir.heap
+! indirection alloca + copyprivate.
+! FIR:     func.func @_QPtest_workshare_firstprivate_array
+! FIR:     omp.parallel {
+! The heap indirection alloca is hoisted for copyprivate
+! FIR:       fir.alloca !fir.heap<!fir.array<?xi32>>
+! FIR:       omp.single copyprivate(
+! The dynamic alloca (firstprivate copy) is inside the single block
+! FIR:         fir.alloca !fir.array<?xi32>
+! FIR:         fir.convert {{.*}} -> !fir.heap<!fir.array<?xi32>>
+! FIR:         fir.store
+! The initialization of the firstprivate copy
+! FIR:         fir.call @_FortranAAssign
+! FIR:         omp.terminator
+! FIR:       }
+! After single, the address is loaded and converted back
+! FIR:       fir.load
+! FIR:       fir.convert {{.*}} -> !fir.ref<!fir.array<?xi32>>
+! The workshared loop uses the broadcast address
+! FIR:       omp.wsloop
+! FIR:       omp.barrier
+! FIR:       omp.terminator
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index 12bae176b70d2..c938860b1fc1f 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -483,3 +483,57 @@ func.func @pointer_descriptor_store_is_thread_local() {
 // CHECK-NEXT:    omp.barrier
 // CHECK-NEXT:    omp.terminator
 // CHECK-NEXT:  }
+
+// -----
+
+// Test for "parallel workshare firstprivate(z)" where z is an array.
+// Check that z is broadcast to all private values of the threads.
+
+// CHECK-LABEL: func.func @dynamic_alloca_firstprivate_array
+func.func @dynamic_alloca_firstprivate_array(%n: index, %src: !fir.ref<!fir.array<?xi32>>, %dst: !fir.ref<!fir.array<?xi32>>) {
+  omp.parallel {
+    omp.workshare {
+      // Dynamic alloca for the firstprivate array copy
+      %z = fir.alloca !fir.array<?xi32>, %n {bindc_name = "z", pinned}
+      %shape = fir.shape %n : (index) -> !fir.shape<1>
+      %decl = fir.declare %z(%shape) {uniq_name = "z"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
+      // A side-effecting op that initializes the firstprivate copy
+      "test.init"(%decl, %src) : (!fir.ref<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
+      // Workshared loop that reads from the firstprivate array
+      %c1 = arith.constant 1 : index
+      omp.workshare.loop_wrapper {
+        omp.loop_nest (%i) : index = (%c1) to (%n) inclusive step (%c1) {
+          %elem = fir.array_coor %decl(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+          %val = fir.load %elem : !fir.ref<i32>
+          %dst_elem = fir.array_coor %dst(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+          fir.store %val to %dst_elem : !fir.ref<i32>
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// The dynamic alloca must be INSIDE the omp.single (not hoisted).
+// A !fir.heap indirection alloca is hoisted for copyprivate.
+// After the single, the array address is loaded and converted back to !fir.ref.
+// CHECK:       omp.parallel {
+// CHECK:         %[[PTR_ALLOC:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>>
+// CHECK:         omp.single copyprivate(%[[PTR_ALLOC]] -> @_workshare_copy_heap_Uxi32 : !fir.ref<!fir.heap<!fir.array<?xi32>>>)
+// The dynamic alloca is inside the single block
+// CHECK:           fir.alloca !fir.array<?xi32>
+// CHECK:           fir.convert {{.*}} : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+// CHECK:           fir.store {{.*}} to %[[PTR_ALLOC]]
+// CHECK:           "test.init"
+// CHECK:           omp.terminator
+// CHECK-NEXT:    }
+// After single, load the broadcast array address and convert back to ref
+// CHECK:         fir.load %[[PTR_ALLOC]]
+// CHECK:         fir.convert {{.*}} : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+// The workshared loop uses the broadcast array address
+// CHECK:         omp.wsloop
+// CHECK:         omp.barrier
+// CHECK:       }