[flang-commits] [flang] [flang][OpenMP] incorrect handling for local variable in OpenMP parallel workshare firstprivate(P) (PR #195616)

Wed Jun 3 04:11:05 PDT 2026

https://github.com/SunilKuravinakop updated https://github.com/llvm/llvm-project/pull/195616

>From dfb65ee076ec5787675df968edc03cc0021b1892 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Mon, 4 May 2026 03:39:17 -0500
Subject: [PATCH 1/6] Handle "!$omp parallel workshare
 firstprivate(P)" where P is an array, creating and initializing
 the local copy properly.

---
 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 32 +++++++-
 .../OpenMP/workshare-firstprivate-pointer.f90 | 64 +++++++++++++++
 .../OpenMP/lower-workshare-thread-local.mlir  | 80 +++++++++++++++++++
 3 files changed, 175 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90

diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index a41d8d8826501..cf51cb887622f 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -156,6 +156,11 @@ static bool isOpenMPThreadLocalMemory(Operation *op, Value mem) {
   fir::AliasAnalysis aliasAnalysis;
   fir::AliasAnalysis::Source source = aliasAnalysis.getSource(mem);
 
+  // With firstprivate(P) where P is a pointer, each thread gets its own copy
+  // of the descriptor, but P(i) accesses shared target data.
+  if (source.accessPath.hasPointerDeref())
+    return false;
+
   // Check if the source is a Value (not a global symbol).
   mlir::Value sourceValue =
       llvm::dyn_cast_if_present<mlir::Value>(source.origin.u);
@@ -370,10 +375,34 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
     SmallVector<Value> copyPrivate;
     bool allParallelized = true;
 
+    // "firstprivate" pointer initialization creates: (1) alloca, (2) store
+    // null box, (3) copy original. If step (2) is duplicated into the
+    // parallel block, it runs after initialization of the private copy and
+    // overwrites the pointer descriptor with null, causing a segfault on
+    // dereference.
+    SmallPtrSet<Value, 4> hoistedCopyprivateAllocas;
+
     for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
       if (isSafeToParallelize(&op)) {
         singleBuilder.clone(op, singleMapping);
-        if (llvm::all_of(op.getOperands(), [&](Value opr) {
+        // Check if this operation writes to a hoisted copyprivate alloca.
+        // Such stores must stay only in the single block; the copyprivate
+        // mechanism handles broadcasting the final value to all threads.
+        bool writesToCopyprivateAlloca = false;
+        if (!hoistedCopyprivateAllocas.empty()) {
+          if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(&op)) {
+            SmallVector<MemoryEffects::EffectInstance> effects;
+            memEffects.getEffects(effects);
+            writesToCopyprivateAlloca =
+                llvm::any_of(effects, [&](const auto &eff) {
+                  return isa<MemoryEffects::Write>(eff.getEffect()) &&
+                         eff.getValue() &&
+                         hoistedCopyprivateAllocas.contains(eff.getValue());
+                });
+          }
+        }
+        if (!writesToCopyprivateAlloca &&
+            llvm::all_of(op.getOperands(), [&](Value opr) {
               // Either we have already remapped it
               bool remapped = rootMapping.contains(opr);
               // Or it is available because it dominates `sr`
@@ -399,6 +428,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
         rootMapping.map(&*alloca, &*hoisted);
         rootMapping.map(alloca.getResult(), hoisted.getResult());
         copyPrivate.push_back(hoisted);
+        hoistedCopyprivateAllocas.insert(alloca.getResult());
         allParallelized = false;
       } else {
         singleBuilder.clone(op, singleMapping);
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
new file mode 100644
index 0000000000000..5e08c2dd161ec
--- /dev/null
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -0,0 +1,64 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+! added to this directory and sub-directories.
+!===----------------------------------------------------------------------===!
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix HLFIR
+!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix FIR
+!RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix LLVM
+
+! Test that parallel workshare with firstprivate(P) where P is a pointer
+! correctly places stores through the pointer target in omp.single rather
+! than parallelizing them. The pointer descriptor is thread-local (firstprivate),
+! but the target data is shared memory.
+
+subroutine test_workshare_firstprivate_pointer(P)
+  integer, pointer, intent(in) :: P(:)
+  integer :: i
+  !$omp parallel workshare firstprivate(P)
+  forall (i = 1:SIZE(P)) P(i) = i
+  !$omp end parallel workshare
+end subroutine
+
+! HLFIR:     omp.parallel {
+! HLFIR:       omp.workshare {
+! The firstprivate copy: alloca, zero-init, declare, then copy from original
+! HLFIR:         fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! HLFIR:         fir.store
+! HLFIR:         hlfir.declare
+! HLFIR:         fir.load
+! HLFIR:         fir.store
+! HLFIR:         hlfir.forall
+! HLFIR:         omp.terminator
+! HLFIR:       }
+! HLFIR:       omp.terminator
+! HLFIR:     }
+
+! After workshare lowering, the forall body (which stores through the pointer
+! target) must be inside omp.single, not parallelized.
+! FIR:     omp.parallel {
+! FIR:       %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! The firstprivate init + copy and the forall loop must be in omp.single
+! FIR:       omp.single copyprivate(%[[DESC]]
+! FIR:         fir.store
+! FIR:         fir.declare
+! FIR:         fir.load
+! FIR:         fir.store
+! The forall loop accesses pointer target (shared memory) - must stay in single
+! FIR:         fir.do_loop
+! FIR:           fir.array_coor
+! FIR:           fir.store
+! FIR:         omp.terminator
+! FIR:       }
+! FIR:       omp.barrier
+! FIR:       omp.terminator
+
+! At LLVM IR level, verify the OpenMP fork call exists and the loop body
+! is inside the outlined function.
+! LLVM:     call void {{.*}}__kmpc_fork_call
+! LLVM:     define internal void @test_workshare_firstprivate_pointer_..omp_par
+! The single construct must be present in the outlined function
+! LLVM:       call i32 @__kmpc_single
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index d6000c989515b..12bae176b70d2 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -403,3 +403,83 @@ func.func @forall_pattern_in_workshare(%shared: !fir.ref<i32>) {
 // CHECK:         }
 // CHECK:         omp.barrier
 // CHECK:       }
+
+
+// Check that a store through a pointer dereference is NOT considered
+// thread-local, even if the pointer descriptor itself is in a thread-local
+// alloca. This models the "parallel workshare firstprivate(P)" case where P
+// is a Fortran POINTER: each thread gets its own copy of the descriptor, but
+// P(i) accesses shared target data through the pointer.
+
+// CHECK-LABEL: func.func @pointer_deref_not_thread_local
+func.func @pointer_deref_not_thread_local() {
+  omp.parallel {
+    // Thread-local alloca for the pointer descriptor (models firstprivate)
+    %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+    %decl = fir.declare %desc {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "p"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+    omp.workshare {
+      // Load the pointer box and access the target data via array_coor.
+      // Even though %desc is thread-local, the target data is shared.
+      %box = fir.load %decl : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+      %c0 = arith.constant 0 : index
+      %dims:3 = fir.box_dims %box, %c0 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+      %shift = fir.shift %dims#0 : (index) -> !fir.shift<1>
+      %c1_i64 = arith.constant 1 : i64
+      %elem = fir.array_coor %box(%shift) %c1_i64 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, i64) -> !fir.ref<i32>
+      %c42 = arith.constant 42 : i32
+      // This store goes to shared target data (through pointer deref),
+      // so it MUST be in omp.single, not parallelized.
+      fir.store %c42 to %elem : !fir.ref<i32>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// The store through the pointer dereference must be inside omp.single.
+// CHECK:       omp.parallel {
+// CHECK:         fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK:         omp.single
+// CHECK:           fir.load
+// CHECK:           fir.box_dims
+// CHECK:           fir.array_coor
+// CHECK:           fir.store
+// CHECK:           omp.terminator
+// CHECK-NEXT:    }
+// CHECK:         omp.barrier
+// CHECK:       }
+
+
+// Check that a direct store to the pointer descriptor alloca (not through
+// the pointer target) IS still recognized as thread-local.
+
+// CHECK-LABEL: func.func @pointer_descriptor_store_is_thread_local
+func.func @pointer_descriptor_store_is_thread_local() {
+  omp.parallel {
+    %desc = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+    omp.workshare {
+      %null = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+      %c0 = arith.constant 0 : index
+      %shape = fir.shape %c0 : (index) -> !fir.shape<1>
+      %box = fir.embox %null(%shape) : (!fir.ptr<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xi32>>>
+      // This store updates the descriptor itself (thread-local alloca),
+      // NOT the pointer target, so it should be parallelized.
+      fir.store %box to %desc : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// The store to the descriptor alloca is thread-local and should NOT be in omp.single.
+// CHECK:       omp.parallel {
+// CHECK-NEXT:    %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+// CHECK:         fir.zero_bits
+// CHECK:         fir.shape
+// CHECK:         fir.embox
+// CHECK:         fir.store {{.*}} to %[[DESC]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK-NEXT:    omp.barrier
+// CHECK-NEXT:    omp.terminator
+// CHECK-NEXT:  }

>From 7472b41f54f62b4314b1a2508a270c80bbebe6c1 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Tue, 5 May 2026 00:31:42 -0500
Subject: [PATCH 2/6] Take care of reference-typed values.

---
 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 49 +++++++++++++++----
 1 file changed, 39 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index cf51cb887622f..fe7064b7cda0b 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -361,9 +361,21 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
     if (auto reloaded = rootMapping.lookupOrNull(v))
       return nullptr;
     Type ty = v.getType();
-    Value alloc = fir::AllocaOp::create(allocaBuilder, loc, ty);
-    fir::StoreOp::create(singleBuilder, loc, singleMapping.lookup(v), alloc);
-    Value reloaded = fir::LoadOp::create(parallelBuilder, loc, ty, alloc);
+    // fir.alloca cannot wrap fir.ref, so for reference-typed values
+    // (e.g. results of dynamic fir.alloca ops) use fir.heap as the
+    // intermediary pointer type for the broadcast alloca.
+    Type allocTy = ty;
+    if (auto rt = mlir::dyn_cast<fir::ReferenceType>(ty))
+      allocTy = fir::HeapType::get(rt.getEleTy());
+    Value alloc = fir::AllocaOp::create(allocaBuilder, loc, allocTy);
+    Value singleVal = singleMapping.lookup(v);
+    if (allocTy != ty)
+      singleVal =
+          fir::ConvertOp::create(singleBuilder, loc, allocTy, singleVal);
+    fir::StoreOp::create(singleBuilder, loc, singleVal, alloc);
+    Value reloaded = fir::LoadOp::create(parallelBuilder, loc, allocTy, alloc);
+    if (allocTy != ty)
+      reloaded = fir::ConvertOp::create(parallelBuilder, loc, ty, reloaded);
     rootMapping.map(v, reloaded);
     return alloc;
   };
@@ -423,13 +435,30 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
           allParallelized = false;
         }
       } else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
-        auto hoisted =
-            cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
-        rootMapping.map(&*alloca, &*hoisted);
-        rootMapping.map(alloca.getResult(), hoisted.getResult());
-        copyPrivate.push_back(hoisted);
-        hoistedCopyprivateAllocas.insert(alloca.getResult());
-        allParallelized = false;
+        if (alloca.isDynamic()) {
+          // Dynamic allocas (e.g. firstprivate arrays with runtime extent)
+          // cannot use the simple load/store copyprivate copy function
+          // because it only copies a single element for sequence types like
+          // !fir.array<?xi32>. Instead, keep the alloca in the single block
+          // and broadcast only its pointer to all threads.
+          singleBuilder.clone(op, singleMapping);
+          if (isTransitivelyUsedOutside(alloca.getResult(), sr)) {
+            auto alloc =
+                mapReloadedValue(alloca.getResult(), allocaBuilder,
+                                 singleBuilder, parallelBuilder, singleMapping);
+            if (alloc)
+              copyPrivate.push_back(alloc);
+          }
+          allParallelized = false;
+        } else {
+          auto hoisted =
+              cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
+          rootMapping.map(&*alloca, &*hoisted);
+          rootMapping.map(alloca.getResult(), hoisted.getResult());
+          copyPrivate.push_back(hoisted);
+          hoistedCopyprivateAllocas.insert(alloca.getResult());
+          allParallelized = false;
+        }
       } else {
         singleBuilder.clone(op, singleMapping);
         // Prepare reloaded values for results of operations that cannot be

>From ed4377c87cb14b338d503e0791d38303e7ac7285 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Tue, 5 May 2026 03:55:28 -0500
Subject: [PATCH 3/6] Update test cases to check the array case, i.e.
 "parallel workshare firstprivate(z)" where z is an array.

---
 .../OpenMP/workshare-firstprivate-pointer.f90 | 37 +++++++++++++
 .../OpenMP/lower-workshare-thread-local.mlir  | 54 +++++++++++++++++++
 2 files changed, 91 insertions(+)

diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
index 5e08c2dd161ec..bac5e4dcdc6a0 100644
--- a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -62,3 +62,40 @@ subroutine test_workshare_firstprivate_pointer(P)
 ! LLVM:     define internal void @test_workshare_firstprivate_pointer_..omp_par
 ! The single construct must be present in the outlined function
 ! LLVM:       call i32 @__kmpc_single
+
+! Test for "workshare firstprivate(z)" where z is an array.
+! Check code to correctly broadcast the address of the firstprivate
+! copy to all threads, instead of using a broken load/store copyprivate
+! that only copies a single element for dynamically-sized arrays.
+
+subroutine test_workshare_firstprivate_array(a, z, n)
+  integer(4) :: n
+  integer(4), dimension(n) :: z, a
+  !$omp parallel workshare firstprivate(z)
+  a = z + 1
+  !$omp end parallel workshare
+end subroutine
+
+! After workshare lowering, the dynamic alloca for the firstprivate copy
+! must be inside omp.single, with its address broadcast via a !fir.heap
+! indirection alloca + copyprivate.
+! FIR:     func.func @_QPtest_workshare_firstprivate_array
+! FIR:     omp.parallel {
+! The heap indirection alloca is hoisted for copyprivate
+! FIR:       fir.alloca !fir.heap<!fir.array<?xi32>>
+! FIR:       omp.single copyprivate(
+! The dynamic alloca (firstprivate copy) is inside the single block
+! FIR:         fir.alloca !fir.array<?xi32>
+! FIR:         fir.convert {{.*}} -> !fir.heap<!fir.array<?xi32>>
+! FIR:         fir.store
+! The initialization of the firstprivate copy
+! FIR:         fir.call @_FortranAAssign
+! FIR:         omp.terminator
+! FIR:       }
+! After single, the address is loaded and converted back
+! FIR:       fir.load
+! FIR:       fir.convert {{.*}} -> !fir.ref<!fir.array<?xi32>>
+! The workshared loop uses the broadcast address
+! FIR:       omp.wsloop
+! FIR:       omp.barrier
+! FIR:       omp.terminator
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index 12bae176b70d2..c938860b1fc1f 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -483,3 +483,57 @@ func.func @pointer_descriptor_store_is_thread_local() {
 // CHECK-NEXT:    omp.barrier
 // CHECK-NEXT:    omp.terminator
 // CHECK-NEXT:  }
+
+// -----
+
+// Test for "parallel workshare firstprivate(z)" where z is an array.
+// Check that z is broadcast to all private values of the threads.
+
+// CHECK-LABEL: func.func @dynamic_alloca_firstprivate_array
+func.func @dynamic_alloca_firstprivate_array(%n: index, %src: !fir.ref<!fir.array<?xi32>>, %dst: !fir.ref<!fir.array<?xi32>>) {
+  omp.parallel {
+    omp.workshare {
+      // Dynamic alloca for the firstprivate array copy
+      %z = fir.alloca !fir.array<?xi32>, %n {bindc_name = "z", pinned}
+      %shape = fir.shape %n : (index) -> !fir.shape<1>
+      %decl = fir.declare %z(%shape) {uniq_name = "z"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
+      // A side-effecting op that initializes the firstprivate copy
+      "test.init"(%decl, %src) : (!fir.ref<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
+      // Workshared loop that reads from the firstprivate array
+      %c1 = arith.constant 1 : index
+      omp.workshare.loop_wrapper {
+        omp.loop_nest (%i) : index = (%c1) to (%n) inclusive step (%c1) {
+          %elem = fir.array_coor %decl(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+          %val = fir.load %elem : !fir.ref<i32>
+          %dst_elem = fir.array_coor %dst(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+          fir.store %val to %dst_elem : !fir.ref<i32>
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// The dynamic alloca must be INSIDE the omp.single (not hoisted).
+// A !fir.heap indirection alloca is hoisted for copyprivate.
+// After the single, the array address is loaded and converted back to !fir.ref.
+// CHECK:       omp.parallel {
+// CHECK:         %[[PTR_ALLOC:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>>
+// CHECK:         omp.single copyprivate(%[[PTR_ALLOC]] -> @_workshare_copy_heap_Uxi32 : !fir.ref<!fir.heap<!fir.array<?xi32>>>)
+// The dynamic alloca is inside the single block
+// CHECK:           fir.alloca !fir.array<?xi32>
+// CHECK:           fir.convert {{.*}} : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+// CHECK:           fir.store {{.*}} to %[[PTR_ALLOC]]
+// CHECK:           "test.init"
+// CHECK:           omp.terminator
+// CHECK-NEXT:    }
+// After single, load the broadcast array address and convert back to ref
+// CHECK:         fir.load %[[PTR_ALLOC]]
+// CHECK:         fir.convert {{.*}} : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
+// The workshared loop uses the broadcast array address
+// CHECK:         omp.wsloop
+// CHECK:         omp.barrier
+// CHECK:       }

>From c2dd9ddd99e52242b5f138894b7fd71a44aa40f8 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Wed, 13 May 2026 11:35:48 -0500
Subject: [PATCH 4/6] 1) Make the checks in the tests more detailed, based on
 feedback. 2) Use fir.box and fir.embox for private copies in "omp
 parallel workshare firstprivate(p)".

---
 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp |  37 +++-
 .../OpenMP/workshare-firstprivate-pointer.f90 | 209 ++++++++++++++----
 .../OpenMP/lower-workshare-thread-local.mlir  |  49 +---
 3 files changed, 200 insertions(+), 95 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index fe7064b7cda0b..bc704de66865f 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -362,11 +362,11 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
       return nullptr;
     Type ty = v.getType();
     // fir.alloca cannot wrap fir.ref, so for reference-typed values
-    // (e.g. results of dynamic fir.alloca ops) use fir.heap as the
+    // (e.g. results of dynamic fir.alloca ops) use fir.ptr as the
     // intermediary pointer type for the broadcast alloca.
     Type allocTy = ty;
     if (auto rt = mlir::dyn_cast<fir::ReferenceType>(ty))
-      allocTy = fir::HeapType::get(rt.getEleTy());
+      allocTy = fir::PointerType::get(rt.getEleTy());
     Value alloc = fir::AllocaOp::create(allocaBuilder, loc, allocTy);
     Value singleVal = singleMapping.lookup(v);
     if (allocTy != ty)
@@ -440,14 +440,35 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
           // cannot use the simple load/store copyprivate copy function
           // because it only copies a single element for sequence types like
           // !fir.array<?xi32>. Instead, keep the alloca in the single block
-          // and broadcast only its pointer to all threads.
+          // and broadcast its address via a box to all threads. The box
+          // preserves shape information and is semantically correct for
+          // copyprivate.
           singleBuilder.clone(op, singleMapping);
           if (isTransitivelyUsedOutside(alloca.getResult(), sr)) {
-            auto alloc =
-                mapReloadedValue(alloca.getResult(), allocaBuilder,
-                                 singleBuilder, parallelBuilder, singleMapping);
-            if (alloc)
-              copyPrivate.push_back(alloc);
+            Value clonedResult = singleMapping.lookup(alloca.getResult());
+            if (!rootMapping.lookupOrNull(alloca.getResult())) {
+              // Create a box type wrapping the allocated array type.
+              Type eleTy =
+                  cast<fir::ReferenceType>(alloca.getType()).getEleTy();
+              auto boxTy = fir::BoxType::get(eleTy);
+              Value boxAlloc = fir::AllocaOp::create(allocaBuilder, loc, boxTy);
+              // In single: create a shape from the alloca extents, embox
+              // the array, and store the box.
+              SmallVector<Value> extents;
+              for (Value ext : alloca.getShape())
+                extents.push_back(singleMapping.lookupOrDefault(ext));
+              Value shape = fir::ShapeOp::create(singleBuilder, loc, extents);
+              Value box = fir::EmboxOp::create(singleBuilder, loc, boxTy,
+                                               clonedResult, shape);
+              fir::StoreOp::create(singleBuilder, loc, box, boxAlloc);
+              // After single: load the box and extract the address.
+              Value loadedBox =
+                  fir::LoadOp::create(parallelBuilder, loc, boxTy, boxAlloc);
+              Value addr = fir::BoxAddrOp::create(parallelBuilder, loc,
+                                                  alloca.getType(), loadedBox);
+              rootMapping.map(alloca.getResult(), addr);
+              copyPrivate.push_back(boxAlloc);
+            }
           }
           allParallelized = false;
         } else {
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
index bac5e4dcdc6a0..eb266a450e55d 100644
--- a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -9,6 +9,7 @@
 !RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix HLFIR
 !RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix FIR
 !RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix LLVM
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | fir-opt --lower-workshare --allow-unregistered-dialect -o - | FileCheck %s --check-prefix FIROPT
 
 ! Test that parallel workshare with firstprivate(P) where P is a pointer
 ! correctly places stores through the pointer target in omp.single rather
@@ -23,45 +24,162 @@ subroutine test_workshare_firstprivate_pointer(P)
   !$omp end parallel workshare
 end subroutine
 
-! HLFIR:     omp.parallel {
-! HLFIR:       omp.workshare {
+! HLFIR-LABEL: {{.*}}test_workshare_firstprivate_pointer{{.*}} {
+! HLFIR:     %[[ORIG_P:.*]]:2 = hlfir.declare %{{.*}} {{.*}}uniq_name = "_QFtest_workshare_firstprivate_pointerEp"
+! HLFIR-LABEL:     omp.parallel {
+! HLFIR-LABEL:       omp.workshare {
 ! The firstprivate copy: alloca, zero-init, declare, then copy from original
-! HLFIR:         fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
-! HLFIR:         fir.store
-! HLFIR:         hlfir.declare
-! HLFIR:         fir.load
-! HLFIR:         fir.store
+! HLFIR:         %[[FP_ALLOCA:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
+! HLFIR:         fir.store %{{.*}} to %[[FP_ALLOCA]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! HLFIR:         %[[FP_DECL:.*]]:2 = hlfir.declare %[[FP_ALLOCA]] {{{.*}}uniq_name = "_QFtest_workshare_firstprivate_pointerEp"}
+! HLFIR:         %[[ORIG_VAL:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! HLFIR:         fir.store %[[ORIG_VAL]] to %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
 ! HLFIR:         hlfir.forall
 ! HLFIR:         omp.terminator
 ! HLFIR:       }
 ! HLFIR:       omp.terminator
 ! HLFIR:     }
+! HLFIR:     return
 
 ! After workshare lowering, the forall body (which stores through the pointer
 ! target) must be inside omp.single, not parallelized.
-! FIR:     omp.parallel {
-! FIR:       %[[DESC:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
-! The firstprivate init + copy and the forall loop must be in omp.single
-! FIR:       omp.single copyprivate(%[[DESC]]
-! FIR:         fir.store
-! FIR:         fir.declare
-! FIR:         fir.load
-! FIR:         fir.store
-! The forall loop accesses pointer target (shared memory) - must stay in single
-! FIR:         fir.do_loop
-! FIR:           fir.array_coor
-! FIR:           fir.store
-! FIR:         omp.terminator
-! FIR:       }
-! FIR:       omp.barrier
-! FIR:       omp.terminator
+! FIR: {{.*}}test_workshare_firstprivate_pointer
+! FIR-SAME: (%[[ARG0:.*]]: {{.*}}) {
+! FIR: %[[C1:.*]] = arith.constant 1 : index
+! FIR: %[[C1_I32:.*]] = arith.constant 1 : i32
+! FIR: %[[C0:.*]] = arith.constant 0 : index
+! FIR: %[[DSCOPE:.*]] = fir.dummy_scope{{.*}}
+! FIR: %[[P_DECL:.*]] = fir.declare %[[ARG0]]{{.*}}fortran_attrs = #fir.var_attrs<intent_in, pointer>{{.*}}
+! FIR: omp.parallel {
+! Thread-private storage for firstprivate pointer descriptor.
+! FIR: %[[P_PRIV:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {
+! FIR-SAME: bindc_name = "p"
+! FIR-SAME: pinned
+! FIR: omp.single copyprivate(%[[P_PRIV]]{{.*}} {
+! FIR: %[[ZERO_PTR:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+! FIR: %[[SHAPE:.*]] = fir.shape %[[C0]]
+! FIR: %[[EMPTY_BOX:.*]] = fir.embox %[[ZERO_PTR]](%[[SHAPE]])
+! FIR: fir.store %[[EMPTY_BOX]] to %[[P_PRIV]]
+! FIR-SAME: : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! FIR: %[[P_FP_DECL:.*]] = fir.declare %[[P_PRIV]]
+! FIR-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
+! FIR: %[[ORIG_BOX:.*]] = fir.load %[[P_DECL]]
+! FIR-SAME: : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! FIR: fir.store %[[ORIG_BOX]] to %[[P_FP_DECL]]
+! FIR-SAME: : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+! FIR: %[[P_PRIVATE:.*]] = fir.load %[[P_FP_DECL]]
+! FIR: %[[P_SIZE:.*]]:3 = fir.box_dims %[[P_PRIVATE]], %[[C0]]
+! FIR: %[[SIZE_TMP1:.*]] = fir.convert %[[P_SIZE]]#1
+! FIR: %[[SIZE_TMP2:.*]] = fir.convert %[[SIZE_TMP1]]
+! FIR: %[[LOOP_LB:.*]] = fir.convert %[[C1_I32]]
+! FIR: %[[LOOP_UB:.*]] = fir.convert %[[SIZE_TMP2]]
+! FIR: fir.do_loop %[[IV:.*]] = %[[LOOP_LB]] to %[[LOOP_UB]] step %[[C1]] {
+! FIR: %[[IV_VAL:.*]] = fir.convert %[[IV]]
+! FIR: fir.store %[[IV_VAL]] to %[[I_PRIV:.*]] : !fir.ref<i32>
+! FIR: %[[RHS_STORE_VAL:.*]] = fir.load %[[I_PRIV]] : !fir.ref<i32>
+! FIR: %[[P_CUR:.*]] = fir.load %[[P_FP_DECL]]
+! FIR: %[[LHS_ELEM_ADDR:.*]] = fir.array_coor %[[P_CUR]]
+! FIR: fir.store %[[RHS_STORE_VAL]] to %[[LHS_ELEM_ADDR]] : !fir.ref<i32>
+! FIR: omp.terminator
+! FIR: }
+! FIR: omp.barrier
+! FIR: omp.terminator
+! FIR: }
+! FIR: return
+
+! FIROPT: func.func @_QPtest_workshare_firstprivate_pointer(
+! FIROPT-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+
+! FIROPT: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! FIROPT: %[[I_ALLOC:.*]] = fir.alloca i32
+! FIROPT: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_ALLOC]]
+
+! FIROPT: %[[P_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[SCOPE]] arg 1
+! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
+
+! FIROPT: omp.parallel {
+
+! FIROPT: %[[P_PRIV:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "p", pinned
+
+! FIROPT: omp.single copyprivate(%[[P_PRIV]] -> @_workshare_copy_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) nowait {
+
+! FIROPT: %[[ZERO_PTR:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
+! FIROPT: %[[C0:.*]] = arith.constant 0 : index
+! FIROPT: %[[SHAPE:.*]] = fir.shape %[[C0]] : (index) -> !fir.shape<1>
+! FIROPT: %[[EMBOX:.*]] = fir.embox %[[ZERO_PTR]](%[[SHAPE]])
+! FIROPT: fir.store %[[EMBOX]] to %[[P_PRIV]]
+
+! FIROPT: %[[P_FP:.*]]:2 = hlfir.declare %[[P_PRIV]]
+! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
+
+! FIROPT: %[[LOAD_ORIG:.*]] = fir.load %[[P_DECL]]#0
+! FIROPT: fir.store %[[LOAD_ORIG]] to %[[P_FP]]#0
+
+! FIROPT: %[[C1:.*]] = arith.constant 1 : i32
+
+! FIROPT: %[[LOAD_PRIV:.*]] = fir.load %[[P_FP]]#0
+! FIROPT: %[[C0_2:.*]] = arith.constant 0 : index
+! FIROPT: %[[DIMS:.*]]:3 = fir.box_dims %[[LOAD_PRIV]], %[[C0_2]]
+
+! FIROPT: %[[EXT64:.*]] = fir.convert %[[DIMS]]#1 : (index) -> i64
+! FIROPT: %[[EXT32:.*]] = fir.convert %[[EXT64]] : (i64) -> i32
+
+! FIROPT: hlfir.forall lb {
+! FIROPT: hlfir.yield %[[C1]] : i32
+! FIROPT: } ub {
+! FIROPT: hlfir.yield %[[EXT32]] : i32
+! FIROPT: }  (%[[IV:.*]]: i32) {
+
+! FIROPT: %[[IDX:.*]] = hlfir.forall_index "i" %[[IV]] : (i32) -> !fir.ref<i32>
+
+! FIROPT: hlfir.region_assign {
+
+! FIROPT: %[[IDX_VAL:.*]] = fir.load %[[IDX]] : !fir.ref<i32>
+! FIROPT: hlfir.yield %[[IDX_VAL]] : i32
+
+! FIROPT: } to {
+
+! FIROPT: %[[LOAD_BOX:.*]] = fir.load %[[P_FP]]#0
+! FIROPT: %[[LOAD_I:.*]] = fir.load %[[IDX]] : !fir.ref<i32>
+! FIROPT: %[[IDX64:.*]] = fir.convert %[[LOAD_I]] : (i32) -> i64
+
+! FIROPT: %[[DESIG:.*]] = hlfir.designate %[[LOAD_BOX]] (%[[IDX64]])
+! FIROPT-SAME: -> !fir.ref<i32>
+
+! FIROPT: hlfir.yield %[[DESIG]] : !fir.ref<i32>
+
+! FIROPT: }
+! FIROPT: }
+
+! FIROPT: omp.terminator
+! FIROPT: }
+
+! FIROPT: %[[POST_DECL:.*]]:2 = hlfir.declare %[[P_PRIV]]
+! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
+
+! FIROPT: omp.barrier
+! FIROPT: omp.terminator
+! FIROPT: }
 
 ! At LLVM IR level, verify the OpenMP fork call exists and the loop body
 ! is inside the outlined function.
-! LLVM:     call void {{.*}}__kmpc_fork_call
-! LLVM:     define internal void @test_workshare_firstprivate_pointer_..omp_par
-! The single construct must be present in the outlined function
+! LLVM:       call void {{.*}}__kmpc_fork_call({{.*}}@test_workshare_firstprivate_pointer_..omp_par{{.*}})
+! LLVM: {{.*}}test_workshare_firstprivate_pointer_..omp_par{{.*}}
+! LLVM-LABEL: omp.par.region{{[0-9]+}}:
 ! LLVM:       call i32 @__kmpc_single
+! LLVM:       icmp ne i32
+! LLVM-LABEL: omp_region.end:
+! LLVM:       call void @__kmpc_copyprivate
+! LLVM:       call void {{.*}}__kmpc_barrier
+! LLVM-LABEL: omp.single.region:
+! LLVM:       call void @llvm.memcpy{{.*}}
+! LLVM:       getelementptr {{.*}} i32 0, i32 7
+! LLVM:       load i64{{.*}}
+! LLVM-LABEL: omp_region.finalize:
+! LLVM:       call void @__kmpc_end_single
+! LLVM:       store i32 %{{.*}}, ptr %{{.*}}
+! LLVM:       getelementptr nusw nuw i8
+! LLVM:       ret void
 
 ! Test for "workshare firstprivate(z)" where z is an array.
 ! Check code to correctly broadcast the address of the firstprivate
@@ -77,25 +195,38 @@ subroutine test_workshare_firstprivate_array(a, z, n)
 end subroutine
 
 ! After workshare lowering, the dynamic alloca for the firstprivate copy
-! must be inside omp.single, with its address broadcast via a !fir.heap
+! must be inside omp.single, with its address broadcast via a !fir.box
 ! indirection alloca + copyprivate.
-! FIR:     func.func @_QPtest_workshare_firstprivate_array
-! FIR:     omp.parallel {
-! The heap indirection alloca is hoisted for copyprivate
-! FIR:       fir.alloca !fir.heap<!fir.array<?xi32>>
-! FIR:       omp.single copyprivate(
+! FIR-LABEL:     {{.*}}test_workshare_firstprivate_array(
+! FIR:           %[[C1:.*]] = arith.constant 1 : i32
+! FIR-LABEL:     omp.parallel {
+
+! The box indirection alloca is hoisted for copyprivate
+! FIR:       omp.single copyprivate(%[[BOX_INDIRECT:.*]] -> @_workshare_copy_box_Uxi32{{.*}}) {
+
 ! The dynamic alloca (firstprivate copy) is inside the single block
-! FIR:         fir.alloca !fir.array<?xi32>
-! FIR:         fir.convert {{.*}} -> !fir.heap<!fir.array<?xi32>>
-! FIR:         fir.store
+! FIR:         %[[FP_ARRAY:.*]] = fir.alloca{{.*}}
+
+! Runtime shape construction for the firstprivate array.
+! FIR:         %[[SHAPE:.*]] = fir.shape %{{.*}}
+! FIR:         %[[BOX_VAL:.*]] = fir.embox %[[FP_ARRAY]](%[[SHAPE]]){{.*}}fir.array<?xi32>{{.*}}
+! FIR:         fir.store %[[BOX_VAL]] to %[[BOX_INDIRECT]] {{.*}}fir.array<?xi32>{{.*}}
+
 ! The initialization of the firstprivate copy
 ! FIR:         fir.call @_FortranAAssign
 ! FIR:         omp.terminator
 ! FIR:       }
-! After single, the address is loaded and converted back
-! FIR:       fir.load
-! FIR:       fir.convert {{.*}} -> !fir.ref<!fir.array<?xi32>>
+! After single, the box is loaded and the address extracted
+! FIR:       %[[LOADED_BOX:.*]] = fir.load %[[BOX_INDIRECT]]{{.*}}fir.array<?xi32>{{.*}}
+! FIR:       %[[ARRAY_ADDR:.*]] = fir.box_addr %[[LOADED_BOX]]{{.*}}fir.array<?xi32>>{{.*}}
 ! The workshared loop uses the broadcast address
-! FIR:       omp.wsloop
+! FIR:       omp.wsloop {
+! FIR:         %[[SRC_ELEM:.*]] = fir.array_coor %{{.*}}(%{{.*}}) %{{.*}}
+! FIR:         %[[SRC_VAL:.*]] = fir.load %[[SRC_ELEM]]
+! FIR:         %[[ADD_RES:.*]] = arith.addi %[[SRC_VAL]], %[[C1]] : i32
+! FIR:         %[[DST_ELEM:.*]] = fir.array_coor %{{.*}}(%{{.*}}) %{{.*}}
+! FIR:         fir.store %[[ADD_RES]] to %[[DST_ELEM]]
+! FIR:       }
 ! FIR:       omp.barrier
 ! FIR:       omp.terminator
+! FIR:       return
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index c938860b1fc1f..e8cb9065123e7 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -488,52 +488,5 @@ func.func @pointer_descriptor_store_is_thread_local() {
 
 // Test for "parallel workshare firstprivate(z)" where z is an array.
 // Check that z is broadcast to all private values of the threads.
+// This test is now part of flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
 
-// CHECK-LABEL: func.func @dynamic_alloca_firstprivate_array
-func.func @dynamic_alloca_firstprivate_array(%n: index, %src: !fir.ref<!fir.array<?xi32>>, %dst: !fir.ref<!fir.array<?xi32>>) {
-  omp.parallel {
-    omp.workshare {
-      // Dynamic alloca for the firstprivate array copy
-      %z = fir.alloca !fir.array<?xi32>, %n {bindc_name = "z", pinned}
-      %shape = fir.shape %n : (index) -> !fir.shape<1>
-      %decl = fir.declare %z(%shape) {uniq_name = "z"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
-      // A side-effecting op that initializes the firstprivate copy
-      "test.init"(%decl, %src) : (!fir.ref<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
-      // Workshared loop that reads from the firstprivate array
-      %c1 = arith.constant 1 : index
-      omp.workshare.loop_wrapper {
-        omp.loop_nest (%i) : index = (%c1) to (%n) inclusive step (%c1) {
-          %elem = fir.array_coor %decl(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
-          %val = fir.load %elem : !fir.ref<i32>
-          %dst_elem = fir.array_coor %dst(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
-          fir.store %val to %dst_elem : !fir.ref<i32>
-          omp.yield
-        }
-      }
-      omp.terminator
-    }
-    omp.terminator
-  }
-  return
-}
-
-// The dynamic alloca must be INSIDE the omp.single (not hoisted).
-// A !fir.heap indirection alloca is hoisted for copyprivate.
-// After the single, the array address is loaded and converted back to !fir.ref.
-// CHECK:       omp.parallel {
-// CHECK:         %[[PTR_ALLOC:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>>
-// CHECK:         omp.single copyprivate(%[[PTR_ALLOC]] -> @_workshare_copy_heap_Uxi32 : !fir.ref<!fir.heap<!fir.array<?xi32>>>)
-// The dynamic alloca is inside the single block
-// CHECK:           fir.alloca !fir.array<?xi32>
-// CHECK:           fir.convert {{.*}} : (!fir.ref<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
-// CHECK:           fir.store {{.*}} to %[[PTR_ALLOC]]
-// CHECK:           "test.init"
-// CHECK:           omp.terminator
-// CHECK-NEXT:    }
-// After single, load the broadcast array address and convert back to ref
-// CHECK:         fir.load %[[PTR_ALLOC]]
-// CHECK:         fir.convert {{.*}} : (!fir.heap<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
-// The workshared loop uses the broadcast array address
-// CHECK:         omp.wsloop
-// CHECK:         omp.barrier
-// CHECK:       }

>From be6a825bac11c391a2f808c52514c7d52650c3e0 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Wed, 20 May 2026 03:27:07 -0500
Subject: [PATCH 5/6] 1) Correct the lowering so that every thread has its
 own private variable. 2) Move the fir-opt checks back into
 flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir from
 flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90. 3) Add
 extra tests to
 flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90.

---
 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 145 +++++--
 .../OpenMP/workshare-firstprivate-pointer.f90 | 366 +++++++++++++-----
 .../OpenMP/lower-workshare-thread-local.mlir  |  73 +++-
 3 files changed, 453 insertions(+), 131 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index bc704de66865f..50463810d5025 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -301,6 +301,74 @@ static mlir::func::FuncOp createCopyFunc(mlir::Location loc, mlir::Type varType,
   return funcOp;
 }
 
+/// Returns a private `(dst, src)` copy function for the reference-to-box type
+/// `varType`, reusing a same-named function if the module already has one. It
+/// copies the array DATA (not just the descriptor) via the Fortran runtime's
+/// _FortranAAssign. Needed for copyprivate of dynamically-sized arrays, where
+/// each thread has its own allocation and receives the single thread's data.
+static mlir::func::FuncOp createBoxDataCopyFunc(mlir::Location loc,
+                                                mlir::Type varType,
+                                                fir::FirOpBuilder builder) {
+  mlir::ModuleOp module = builder.getModule();
+  auto rt = cast<fir::ReferenceType>(varType);
+  mlir::Type eleTy = rt.getEleTy();
+  std::string copyFuncName =
+      fir::getTypeAsString(eleTy, builder.getKindMap(), "_workshare_copy_data");
+
+  if (auto decl = module.lookupSymbol<mlir::func::FuncOp>(copyFuncName))
+    return decl;
+
+  // Declare _FortranAAssign as a private function if the module lacks it.
+  auto boxNoneTy = fir::BoxType::get(builder.getNoneType());
+  auto refBoxNoneTy = fir::ReferenceType::get(boxNoneTy);
+  auto refI8Ty = fir::ReferenceType::get(builder.getIntegerType(8));
+  auto i32Ty = builder.getI32Type();
+  llvm::StringRef assignFuncName = "_FortranAAssign";
+  auto assignFunc = module.lookupSymbol<mlir::func::FuncOp>(assignFuncName);
+  if (!assignFunc) {
+    mlir::OpBuilder::InsertionGuard g(builder);
+    mlir::OpBuilder modBuilder(module.getBodyRegion());
+    auto assignFuncType = mlir::FunctionType::get(
+        builder.getContext(), {refBoxNoneTy, boxNoneTy, refI8Ty, i32Ty}, {});
+    assignFunc = mlir::func::FuncOp::create(modBuilder, loc, assignFuncName,
+                                            assignFuncType);
+    assignFunc.setVisibility(mlir::SymbolTable::Visibility::Private);
+  }
+
+  // Create the copy function; both of its arguments have type varType.
+  mlir::OpBuilder::InsertionGuard guard(builder);
+  mlir::OpBuilder modBuilder(module.getBodyRegion());
+  llvm::SmallVector<mlir::Type> argsTy = {varType, varType};
+  auto funcType = mlir::FunctionType::get(builder.getContext(), argsTy, {});
+  mlir::func::FuncOp funcOp =
+      mlir::func::FuncOp::create(modBuilder, loc, copyFuncName, funcType);
+  funcOp.setVisibility(mlir::SymbolTable::Visibility::Private);
+  fir::factory::setInternalLinkage(funcOp);
+  builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy,
+                      {loc, loc});
+  builder.setInsertionPointToStart(&funcOp.getRegion().back());
+
+  // Load the source box from the second argument.
+  Value srcBox =
+      fir::LoadOp::create(builder, loc, eleTy, funcOp.getArgument(1));
+
+  // Convert dst/src to the box<none> types declared for _FortranAAssign.
+  Value dstConv =
+      fir::ConvertOp::create(builder, loc, refBoxNoneTy, funcOp.getArgument(0));
+  Value srcConv = fir::ConvertOp::create(builder, loc, boxNoneTy, srcBox);
+
+  // Null source location (zero pointer, line 0); only used for error reporting.
+  Value nullLoc = fir::ZeroOp::create(builder, loc, refI8Ty);
+  Value zeroLine = builder.createIntegerConstant(loc, i32Ty, 0);
+
+  // Call _FortranAAssign to copy the array data from src into dst.
+  fir::CallOp::create(builder, loc, assignFunc,
+                      mlir::ValueRange{dstConv, srcConv, nullLoc, zeroLine});
+
+  mlir::func::ReturnOp::create(builder, loc);
+  return funcOp;
+}
+
 static bool isUserOutsideSR(Operation *user, Operation *parentOp,
                             SingleRegion sr) {
   while (user->getParentOp() != parentOp)
@@ -380,11 +448,12 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
     return alloc;
   };
 
-  auto moveToSingle =
-      [&](SingleRegion sr, OpBuilder allocaBuilder, OpBuilder singleBuilder,
-          OpBuilder parallelBuilder) -> std::pair<bool, SmallVector<Value>> {
+  auto moveToSingle = [&](SingleRegion sr, OpBuilder allocaBuilder,
+                          OpBuilder singleBuilder, OpBuilder parallelBuilder)
+      -> std::tuple<bool, SmallVector<Value>, SmallPtrSet<Value, 4>> {
     IRMapping singleMapping = rootMapping;
     SmallVector<Value> copyPrivate;
+    SmallPtrSet<Value, 4> boxDataCopyVars;
     bool allParallelized = true;
 
     // "firstprivate" pointer initialization creates: (1) alloca, (2) store
@@ -408,8 +477,8 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
             writesToCopyprivateAlloca =
                 llvm::any_of(effects, [&](const auto &eff) {
                   return isa<MemoryEffects::Write>(eff.getEffect()) &&
-                         eff.getValue() &&
-                         hoistedCopyprivateAllocas.contains(eff.getValue());
+                         (!eff.getValue() ||
+                          hoistedCopyprivateAllocas.contains(eff.getValue()));
                 });
           }
         }
@@ -437,39 +506,34 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
       } else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
         if (alloca.isDynamic()) {
           // Dynamic allocas (e.g. firstprivate arrays with runtime extent)
-          // cannot use the simple load/store copyprivate copy function
-          // because it only copies a single element for sequence types like
-          // !fir.array<?xi32>. Instead, keep the alloca in the single block
-          // and broadcast its address via a box to all threads. The box
-          // preserves shape information and is semantically correct for
-          // copyprivate.
-          singleBuilder.clone(op, singleMapping);
+          // are hoisted so each thread gets its own allocation, providing
+          // true firstprivate semantics. The array data is broadcast via
+          // copyprivate using a box that carries shape information. The
+          // copyprivate copy function uses _FortranAAssign to copy the
+          // actual array data (not just the descriptor) between threads.
+          auto hoisted =
+              cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
+          rootMapping.map(&*alloca, &*hoisted);
+          rootMapping.map(alloca.getResult(), hoisted.getResult());
+
           if (isTransitivelyUsedOutside(alloca.getResult(), sr)) {
-            Value clonedResult = singleMapping.lookup(alloca.getResult());
-            if (!rootMapping.lookupOrNull(alloca.getResult())) {
-              // Create a box type wrapping the allocated array type.
-              Type eleTy =
-                  cast<fir::ReferenceType>(alloca.getType()).getEleTy();
-              auto boxTy = fir::BoxType::get(eleTy);
-              Value boxAlloc = fir::AllocaOp::create(allocaBuilder, loc, boxTy);
-              // In single: create a shape from the alloca extents, embox
-              // the array, and store the box.
-              SmallVector<Value> extents;
-              for (Value ext : alloca.getShape())
-                extents.push_back(singleMapping.lookupOrDefault(ext));
-              Value shape = fir::ShapeOp::create(singleBuilder, loc, extents);
-              Value box = fir::EmboxOp::create(singleBuilder, loc, boxTy,
-                                               clonedResult, shape);
-              fir::StoreOp::create(singleBuilder, loc, box, boxAlloc);
-              // After single: load the box and extract the address.
-              Value loadedBox =
-                  fir::LoadOp::create(parallelBuilder, loc, boxTy, boxAlloc);
-              Value addr = fir::BoxAddrOp::create(parallelBuilder, loc,
-                                                  alloca.getType(), loadedBox);
-              rootMapping.map(alloca.getResult(), addr);
-              copyPrivate.push_back(boxAlloc);
-            }
+            // Create a box slot for copyprivate to broadcast the array data.
+            Type eleTy = cast<fir::ReferenceType>(alloca.getType()).getEleTy();
+            auto boxTy = fir::BoxType::get(eleTy);
+            Value boxAlloc = fir::AllocaOp::create(allocaBuilder, loc, boxTy);
+
+            // Embox the per-thread array allocation with its shape extents.
+            Value shape =
+                fir::ShapeOp::create(allocaBuilder, loc, hoisted.getShape());
+            Value box = fir::EmboxOp::create(allocaBuilder, loc, boxTy,
+                                             hoisted.getResult(), shape);
+            fir::StoreOp::create(allocaBuilder, loc, box, boxAlloc);
+
+            copyPrivate.push_back(boxAlloc);
+            boxDataCopyVars.insert(boxAlloc);
           }
+
+          hoistedCopyprivateAllocas.insert(alloca.getResult());
           allParallelized = false;
         } else {
           auto hoisted =
@@ -496,7 +560,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
       }
     }
     omp::TerminatorOp::create(singleBuilder, loc);
-    return {allParallelized, copyPrivate};
+    return {allParallelized, copyPrivate, boxDataCopyVars};
   };
 
   for (Block &block : sourceRegion) {
@@ -557,7 +621,7 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
         Block *parallelBlock = new Block();
         parallelBuilder.setInsertionPointToStart(parallelBlock);
 
-        auto [allParallelized, copyprivateVars] =
+        auto [allParallelized, copyprivateVars, boxDataCopyVars] =
             moveToSingle(std::get<SingleRegion>(opOrSingle), allocaBuilder,
                          singleBuilder, parallelBuilder);
         if (allParallelized) {
@@ -574,7 +638,10 @@ static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
           cleanupBlock(singleBlock);
           for (auto var : singleOperands.copyprivateVars) {
             mlir::func::FuncOp funcOp =
-                createCopyFunc(loc, var.getType(), firCopyFuncBuilder);
+                boxDataCopyVars.contains(var)
+                    ? createBoxDataCopyFunc(loc, var.getType(),
+                                            firCopyFuncBuilder)
+                    : createCopyFunc(loc, var.getType(), firCopyFuncBuilder);
             singleOperands.copyprivateSyms.push_back(
                 SymbolRefAttr::get(funcOp));
           }
diff --git a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90 b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
index eb266a450e55d..b8c9dc13a1e90 100644
--- a/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
+++ b/flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
@@ -9,7 +9,6 @@
 !RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix HLFIR
 !RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix FIR
 !RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefix LLVM
-!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | fir-opt --lower-workshare --allow-unregistered-dialect -o - | FileCheck %s --check-prefix FIROPT
 
 ! Test that parallel workshare with firstprivate(P) where P is a pointer
 ! correctly places stores through the pointer target in omp.single rather
@@ -87,80 +86,6 @@ subroutine test_workshare_firstprivate_pointer(P)
 ! FIR: }
 ! FIR: return
 
-! FIROPT: func.func @_QPtest_workshare_firstprivate_pointer(
-! FIROPT-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
-
-! FIROPT: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
-! FIROPT: %[[I_ALLOC:.*]] = fir.alloca i32
-! FIROPT: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_ALLOC]]
-
-! FIROPT: %[[P_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[SCOPE]] arg 1
-! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
-
-! FIROPT: omp.parallel {
-
-! FIROPT: %[[P_PRIV:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>> {bindc_name = "p", pinned
-
-! FIROPT: omp.single copyprivate(%[[P_PRIV]] -> @_workshare_copy_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) nowait {
-
-! FIROPT: %[[ZERO_PTR:.*]] = fir.zero_bits !fir.ptr<!fir.array<?xi32>>
-! FIROPT: %[[C0:.*]] = arith.constant 0 : index
-! FIROPT: %[[SHAPE:.*]] = fir.shape %[[C0]] : (index) -> !fir.shape<1>
-! FIROPT: %[[EMBOX:.*]] = fir.embox %[[ZERO_PTR]](%[[SHAPE]])
-! FIROPT: fir.store %[[EMBOX]] to %[[P_PRIV]]
-
-! FIROPT: %[[P_FP:.*]]:2 = hlfir.declare %[[P_PRIV]]
-! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
-
-! FIROPT: %[[LOAD_ORIG:.*]] = fir.load %[[P_DECL]]#0
-! FIROPT: fir.store %[[LOAD_ORIG]] to %[[P_FP]]#0
-
-! FIROPT: %[[C1:.*]] = arith.constant 1 : i32
-
-! FIROPT: %[[LOAD_PRIV:.*]] = fir.load %[[P_FP]]#0
-! FIROPT: %[[C0_2:.*]] = arith.constant 0 : index
-! FIROPT: %[[DIMS:.*]]:3 = fir.box_dims %[[LOAD_PRIV]], %[[C0_2]]
-
-! FIROPT: %[[EXT64:.*]] = fir.convert %[[DIMS]]#1 : (index) -> i64
-! FIROPT: %[[EXT32:.*]] = fir.convert %[[EXT64]] : (i64) -> i32
-
-! FIROPT: hlfir.forall lb {
-! FIROPT: hlfir.yield %[[C1]] : i32
-! FIROPT: } ub {
-! FIROPT: hlfir.yield %[[EXT32]] : i32
-! FIROPT: }  (%[[IV:.*]]: i32) {
-
-! FIROPT: %[[IDX:.*]] = hlfir.forall_index "i" %[[IV]] : (i32) -> !fir.ref<i32>
-
-! FIROPT: hlfir.region_assign {
-
-! FIROPT: %[[IDX_VAL:.*]] = fir.load %[[IDX]] : !fir.ref<i32>
-! FIROPT: hlfir.yield %[[IDX_VAL]] : i32
-
-! FIROPT: } to {
-
-! FIROPT: %[[LOAD_BOX:.*]] = fir.load %[[P_FP]]#0
-! FIROPT: %[[LOAD_I:.*]] = fir.load %[[IDX]] : !fir.ref<i32>
-! FIROPT: %[[IDX64:.*]] = fir.convert %[[LOAD_I]] : (i32) -> i64
-
-! FIROPT: %[[DESIG:.*]] = hlfir.designate %[[LOAD_BOX]] (%[[IDX64]])
-! FIROPT-SAME: -> !fir.ref<i32>
-
-! FIROPT: hlfir.yield %[[DESIG]] : !fir.ref<i32>
-
-! FIROPT: }
-! FIROPT: }
-
-! FIROPT: omp.terminator
-! FIROPT: }
-
-! FIROPT: %[[POST_DECL:.*]]:2 = hlfir.declare %[[P_PRIV]]
-! FIROPT-SAME: fortran_attrs = #fir.var_attrs<intent_in, pointer>
-
-! FIROPT: omp.barrier
-! FIROPT: omp.terminator
-! FIROPT: }
-
 ! At LLVM IR level, verify the OpenMP fork call exists and the loop body
 ! is inside the outlined function.
 ! LLVM:       call void {{.*}}__kmpc_fork_call({{.*}}@test_workshare_firstprivate_pointer_..omp_par{{.*}})
@@ -195,31 +120,29 @@ subroutine test_workshare_firstprivate_array(a, z, n)
 end subroutine
 
 ! After workshare lowering, the dynamic alloca for the firstprivate copy
-! must be inside omp.single, with its address broadcast via a !fir.box
-! indirection alloca + copyprivate.
+! is hoisted so each thread gets its own allocation (true firstprivate).
+! The array data is broadcast via copyprivate using a box with a
+! data-copying function (_FortranAAssign).
 ! FIR-LABEL:     {{.*}}test_workshare_firstprivate_array(
 ! FIR:           %[[C1:.*]] = arith.constant 1 : i32
 ! FIR-LABEL:     omp.parallel {
 
-! The box indirection alloca is hoisted for copyprivate
-! FIR:       omp.single copyprivate(%[[BOX_INDIRECT:.*]] -> @_workshare_copy_box_Uxi32{{.*}}) {
-
-! The dynamic alloca (firstprivate copy) is inside the single block
-! FIR:         %[[FP_ARRAY:.*]] = fir.alloca{{.*}}
+! The dynamic alloca is hoisted (per-thread allocation)
+! FIR:       %[[FP_ARRAY:.*]] = fir.alloca !fir.array<?xi32>
+! The box slot and embox are hoisted for copyprivate
+! FIR:       %[[BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+! FIR:       %[[SHAPE:.*]] = fir.shape %{{.*}}
+! FIR:       %[[BOX_VAL:.*]] = fir.embox %[[FP_ARRAY]](%[[SHAPE]]){{.*}}
+! FIR:       fir.store %[[BOX_VAL]] to %[[BOX_SLOT]]
 
-! Runtime shape construction for the firstprivate array.
-! FIR:         %[[SHAPE:.*]] = fir.shape %{{.*}}
-! FIR:         %[[BOX_VAL:.*]] = fir.embox %[[FP_ARRAY]](%[[SHAPE]]){{.*}}fir.array<?xi32>{{.*}}
-! FIR:         fir.store %[[BOX_VAL]] to %[[BOX_INDIRECT]] {{.*}}fir.array<?xi32>{{.*}}
+! Copyprivate uses box-data copy function to broadcast array contents
+! FIR:       omp.single copyprivate(%[[BOX_SLOT]] -> @_workshare_copy_data_box_Uxi32{{.*}}) {
 
-! The initialization of the firstprivate copy
+! The initialization of the firstprivate copy (single thread only)
 ! FIR:         fir.call @_FortranAAssign
 ! FIR:         omp.terminator
 ! FIR:       }
-! After single, the box is loaded and the address extracted
-! FIR:       %[[LOADED_BOX:.*]] = fir.load %[[BOX_INDIRECT]]{{.*}}fir.array<?xi32>{{.*}}
-! FIR:       %[[ARRAY_ADDR:.*]] = fir.box_addr %[[LOADED_BOX]]{{.*}}fir.array<?xi32>>{{.*}}
-! The workshared loop uses the broadcast address
+! The workshared loop uses the per-thread allocation directly
 ! FIR:       omp.wsloop {
 ! FIR:         %[[SRC_ELEM:.*]] = fir.array_coor %{{.*}}(%{{.*}}) %{{.*}}
 ! FIR:         %[[SRC_VAL:.*]] = fir.load %[[SRC_ELEM]]
@@ -230,3 +153,264 @@ subroutine test_workshare_firstprivate_array(a, z, n)
 ! FIR:       omp.barrier
 ! FIR:       omp.terminator
 ! FIR:       return
+
+subroutine allocatable_example()
+  implicit none
+
+  integer, allocatable :: p(:)
+  integer :: a(4)
+
+  allocate(p(4))
+  p = [1, 2, 3, 4]
+
+  !$omp parallel workshare firstprivate(p)
+    a = p + 1
+  !$omp end parallel workshare
+end subroutine
+
+! HLFIR-LABEL: func.func @_QPallocatable_example() {
+! HLFIR: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {{.*}}uniq_name = "_QFallocatable_exampleEa"
+! HLFIR: %[[ORIG_P:.*]]:2 = hlfir.declare %{{.*}} {{.*}}fortran_attrs = #fir.var_attrs<allocatable>{{.*}}uniq_name = "_QFallocatable_exampleEp"
+
+! Initial allocation/assignment of original p
+! HLFIR: fir.allocmem !fir.array<?xi32>
+! HLFIR: fir.store %{{.*}} to %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: hlfir.assign %{{.*}} to %[[ORIG_P]]#0 realloc : {{.*}}, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+
+! HLFIR: omp.parallel {
+! HLFIR:   omp.workshare {
+
+! Firstprivate allocatable descriptor
+! HLFIR:     %[[FP_ALLOCA:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {{.*}}bindc_name = "p"{{.*}}
+
+! Allocate/init firstprivate copy depending on original allocation status
+! HLFIR:     %[[ORIG_VAL0:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:     %[[ORIG_ADDR0:.*]] = fir.box_addr %[[ORIG_VAL0]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR:     fir.convert %[[ORIG_ADDR0]]
+! HLFIR:     arith.cmpi ne
+! HLFIR:     fir.if %{{.*}} {
+! HLFIR:       %[[ORIG_VAL1:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:       fir.box_dims %[[ORIG_VAL1]]
+! HLFIR:       fir.allocmem !fir.array<?xi32>
+! HLFIR:       fir.embox
+! HLFIR:       fir.store %{{.*}} to %[[FP_ALLOCA]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:     } else {
+! HLFIR:       fir.zero_bits !fir.heap<!fir.array<?xi32>>
+! HLFIR:       fir.embox
+! HLFIR:       fir.store %{{.*}} to %[[FP_ALLOCA]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:     }
+
+! Declare firstprivate p
+! HLFIR:     %[[FP_DECL:.*]]:2 = hlfir.declare %[[FP_ALLOCA]] {{.*}}fortran_attrs = #fir.var_attrs<allocatable>{{.*}}uniq_name = "_QFallocatable_exampleEp"
+
+! Copy original p into firstprivate p
+! HLFIR:     %[[FP_VAL0:.*]] = fir.load %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:     %[[FP_ADDR0:.*]] = fir.box_addr %[[FP_VAL0]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR:     fir.convert %[[FP_ADDR0]]
+! HLFIR:     arith.cmpi ne
+! HLFIR:     fir.if %{{.*}} {
+! HLFIR:       %[[ORIG_VAL2:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:       hlfir.assign %[[ORIG_VAL2]] to %[[FP_DECL]]#0 realloc : !fir.box<!fir.heap<!fir.array<?xi32>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:     }
+
+! Use firstprivate p in: a = p + 1
+! HLFIR:     %[[FP_VAL1:.*]] = fir.load %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:     %[[EXPR:.*]] = hlfir.elemental
+! HLFIR:       hlfir.designate %[[FP_VAL1]]
+! HLFIR:       fir.load
+! HLFIR:       arith.addi
+! HLFIR:       hlfir.yield_element
+! HLFIR:     hlfir.assign %[[EXPR]] to %[[A]]#0 : !hlfir.expr<?xi32>, !fir.ref<!fir.array<4xi32>>
+! HLFIR:     hlfir.destroy %[[EXPR]] : !hlfir.expr<?xi32>
+
+! Cleanup firstprivate p
+! HLFIR:     %[[FP_VAL2:.*]] = fir.load %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:     %[[FP_ADDR1:.*]] = fir.box_addr %[[FP_VAL2]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR:     fir.convert %[[FP_ADDR1]]
+! HLFIR:     arith.cmpi ne
+! HLFIR:     fir.if %{{.*}} {
+! HLFIR:       %[[FP_VAL3:.*]] = fir.load %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:       %[[FP_ADDR2:.*]] = fir.box_addr %[[FP_VAL3]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR:       fir.freemem %[[FP_ADDR2]] : !fir.heap<!fir.array<?xi32>>
+! HLFIR:       fir.store %{{.*}} to %[[FP_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:     }
+! HLFIR:     omp.terminator
+! HLFIR:   }
+! HLFIR:   omp.terminator
+! HLFIR: }
+
+! Final cleanup of original p
+! HLFIR: %[[ORIG_VAL3:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: %[[ORIG_ADDR1:.*]] = fir.box_addr %[[ORIG_VAL3]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR: fir.convert %[[ORIG_ADDR1]]
+! HLFIR: arith.cmpi ne
+! HLFIR: fir.if %{{.*}} {
+! HLFIR:   %[[ORIG_VAL4:.*]] = fir.load %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR:   %[[ORIG_ADDR2:.*]] = fir.box_addr %[[ORIG_VAL4]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! HLFIR:   fir.freemem %[[ORIG_ADDR2]] : !fir.heap<!fir.array<?xi32>>
+! HLFIR:   fir.store %{{.*}} to %[[ORIG_P]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! HLFIR: }
+
+! HLFIR: return
+! HLFIR: }
+
+! FIR-LABEL: func.func @_QPallocatable_example()
+
+! FIR:           %[[C1_I32:.*]] = arith.constant 1 : i32
+
+! Original allocatable p declaration/allocation
+! FIR:           %[[A_DECL:.*]] = fir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFallocatable_exampleEa"}
+! FIR:           %[[ORIG_P:.*]] = fir.declare %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocatable_exampleEp"}
+! FIR:           fir.allocmem !fir.array<?xi32>
+! FIR:           fir.store %{{.*}} to %[[ORIG_P]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR:           fir.call @_FortranAAssign
+
+! FIR:           omp.parallel {
+
+! Allocas for copyprivate slots
+! FIR:             %[[A_BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.array<4xi32>>
+! FIR:             %[[FP_BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "p", pinned
+! FIR:             %[[COPY_BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
+! FIR:             %[[COPY_HEAP_SLOT:.*]] = fir.alloca !fir.heap<!fir.array<?xi32>>
+
+! Copyprivate with three slots for broadcasting firstprivate data
+! FIR:             omp.single copyprivate(%[[FP_BOX_SLOT]] -> @_workshare_copy_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, %[[COPY_BOX_SLOT]] -> @_workshare_copy_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, %[[COPY_HEAP_SLOT]] -> @_workshare_copy_heap_Uxi32 : !fir.ref<!fir.heap<!fir.array<?xi32>>>) {
+
+! Check original allocation status and allocate firstprivate copy
+! FIR:               %[[ORIG_BOX0:.*]] = fir.load %[[ORIG_P]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR:               fir.box_addr %[[ORIG_BOX0]]
+! FIR:               arith.cmpi ne
+! FIR:               fir.if %{{.*}} {
+! FIR:                 fir.load %[[ORIG_P]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR:                 fir.box_dims
+! FIR:                 fir.allocmem !fir.array<?xi32>
+! FIR:                 fir.store %{{.*}} to %[[FP_BOX_SLOT]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR:               } else {
+! FIR:                 fir.zero_bits !fir.heap<!fir.array<?xi32>>
+! FIR:                 fir.store %{{.*}} to %[[FP_BOX_SLOT]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR:               }
+
+! Declare firstprivate p and copy data from original
+! FIR:               %[[FP_DECL:.*]] = fir.declare %[[FP_BOX_SLOT]] {fortran_attrs = #fir.var_attrs<allocatable>
+! FIR:               fir.load %[[FP_DECL]]
+! FIR:               fir.box_addr
+! FIR:               arith.cmpi ne
+! FIR:               fir.if %{{.*}} {
+! FIR:                 fir.call @_FortranAAssign
+! FIR:               }
+
+! Store to copyprivate broadcast slots
+! FIR:               fir.store %{{.*}} to %[[COPY_BOX_SLOT]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! FIR:               fir.store %{{.*}} to %[[COPY_HEAP_SLOT]] : !fir.ref<!fir.heap<!fir.array<?xi32>>>
+! FIR:               omp.terminator
+! FIR:             }
+
+! After single: use copyprivate broadcast data for workshared computation
+! FIR:             %[[FP_DECL2:.*]] = fir.declare %[[FP_BOX_SLOT]] {fortran_attrs = #fir.var_attrs<allocatable>
+! FIR:             %[[COPY_BOX:.*]] = fir.load %[[COPY_BOX_SLOT]]
+
+! Workshared loop: temp = p + 1
+! FIR:             omp.wsloop {
+! FIR:               omp.loop_nest (%[[I:.*]]) : index
+! FIR:                 %[[SRC_VAL:.*]] = fir.load %{{.*}} : !fir.ref<i32>
+! FIR:                 %[[ADD_RES:.*]] = arith.addi %[[SRC_VAL]], %[[C1_I32]] : i32
+! FIR:                 fir.store %[[ADD_RES]] to %{{.*}} : !fir.ref<i32>
+! FIR:                 omp.yield
+! FIR:             }
+
+! Assignment of temp to a and cleanup (in omp.single nowait)
+! FIR:             omp.single nowait {
+! FIR:               fir.call @_FortranAAssign
+! FIR:               fir.freemem
+! Cleanup firstprivate p
+! FIR:               fir.if %{{.*}} {
+! FIR:                 fir.freemem
+! FIR:               }
+! FIR:               omp.terminator
+! FIR:             }
+
+! FIR:             omp.barrier
+! FIR:             omp.terminator
+! FIR:           }
+
+! Cleanup original p
+! FIR:           fir.if %{{.*}} {
+! FIR:             fir.freemem
+! FIR:           }
+! FIR:           return
+
+subroutine derived_type_example()  ! firstprivate on a fixed-size array of derived type
+  implicit none
+
+  type :: t
+    integer :: x  ! the only component; read below through p%x
+  end type
+
+  type(t) :: p(4)  ! listed in the firstprivate clause below
+  integer :: a(4)  ! receives the result of the workshared assignment
+
+  p%x = [1, 2, 3, 4]  ! initialize the original p before the parallel region
+
+  !$omp parallel workshare firstprivate(p)
+    a = p%x + 1  ! array assignment inside the workshare construct
+  !$omp end parallel workshare
+end subroutine
+
+! FIR-LABEL: func.func @_QPderived_type_example()
+! FIR:           %[[C1_I32:.*]] = arith.constant 1 : i32
+! FIR:           %[[A_DECL:.*]] = fir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFderived_type_exampleEa"}
+! FIR:           %[[ORIG_P:.*]] = fir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFderived_type_exampleEp"}
+! FIR:           fir.call @_FortranAAssign
+! FIR:           omp.parallel {
+
+! Allocas inside the parallel region; FP_ARRAY and HEAP_SLOT are the copyprivate slots
+! FIR:             %[[A_BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.array<4xi32>>
+! FIR:             %[[P_BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.array<4x!fir.type<{{.*}}>>>
+! FIR:             %[[FP_ARRAY:.*]] = fir.alloca !fir.array<4x!fir.type<{{.*}}>> {bindc_name = "p", pinned
+! FIR:             %[[HEAP_SLOT:.*]] = fir.alloca !fir.heap<!fir.array<4xi32>>
+
+! Copyprivate with derived-type copy function and heap copy function
+! FIR:             omp.single copyprivate(%[[FP_ARRAY]] -> @_workshare_copy_4xrec__QFderived_type_exampleTt : {{.*}}, %[[HEAP_SLOT]] -> @_workshare_copy_heap_4xi32 : {{.*}}) {
+
+! Declare firstprivate p and copy original data
+! FIR:               fir.declare %[[FP_ARRAY]]
+! FIR:               fir.call @_FortranAAssign
+
+! Allocate temp array for expression result
+! FIR:               fir.allocmem !fir.array<4xi32>
+! FIR:               fir.store %{{.*}} to %[[HEAP_SLOT]]
+! FIR:               omp.terminator
+! FIR:             }
+
+! After single: declare firstprivate p and extract p%x via slice
+! FIR:             %[[FP_DECL:.*]] = fir.declare %[[FP_ARRAY]]
+! FIR:             fir.field_index x, !fir.type<{{.*}}>
+! FIR:             fir.slice
+! FIR:             %[[PX_ADDR:.*]] = fir.box_addr
+
+! Load temp array from copyprivate slot
+! FIR:             %[[HEAP_VAL:.*]] = fir.load %[[HEAP_SLOT]]
+! FIR:             %[[TMP_DECL:.*]] = fir.declare %[[HEAP_VAL]]
+
+! Workshared loop: temp = p%x + 1
+! FIR:             omp.wsloop {
+! FIR:               omp.loop_nest (%[[I:.*]]) : index
+! FIR:                 %[[SRC_ELEM:.*]] = fir.array_coor %[[PX_ADDR]]
+! FIR:                 %[[SRC_VAL:.*]] = fir.load %[[SRC_ELEM]] : !fir.ref<i32>
+! FIR:                 %[[ADD_RES:.*]] = arith.addi %[[SRC_VAL]], %[[C1_I32]] : i32
+! FIR:                 %[[DST_ELEM:.*]] = fir.array_coor %[[TMP_DECL]]
+! FIR:                 fir.store %[[ADD_RES]] to %[[DST_ELEM]] : !fir.ref<i32>
+! FIR:                 omp.yield
+
+! Assignment of temp to a and cleanup (in omp.single nowait)
+! FIR:             omp.single nowait {
+! FIR:               fir.call @_FortranAAssign
+! FIR:               fir.freemem
+! FIR:               omp.terminator
+! FIR:             }
+
+! FIR:             omp.barrier
+! FIR:             omp.terminator
+! FIR:           }
+
+! FIR:           return
diff --git a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
index e8cb9065123e7..124077a9b92ac 100644
--- a/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
+++ b/flang/test/Transforms/OpenMP/lower-workshare-thread-local.mlir
@@ -488,5 +488,76 @@ func.func @pointer_descriptor_store_is_thread_local() {
 
 // Test for "parallel workshare firstprivate(z)" where z is an array.
 // Check that z is broadcast to all private values of the threads.
-// This test is now part of flang/test/Integration/OpenMP/workshare-firstprivate-pointer.f90
 
+// The copy function uses _FortranAAssign to broadcast array data.
+// CHECK-LABEL: func.func private @_workshare_copy_data_box_Uxi32(
+// CHECK: fir.call @_FortranAAssign
+// CHECK: return
+
+// CHECK-LABEL: func.func @dynamic_alloca_firstprivate_array(
+// CHECK-SAME: %[[N:.*]]: index,
+// CHECK-SAME: %[[SRC:.*]]: !fir.ref<!fir.array<?xi32>>,
+// CHECK-SAME: %[[DST:.*]]: !fir.ref<!fir.array<?xi32>>)
+func.func @dynamic_alloca_firstprivate_array(%n: index, %src: !fir.ref<!fir.array<?xi32>>, %dst: !fir.ref<!fir.array<?xi32>>) {  // test input: workshare body with a dynamically sized firstprivate array
+  omp.parallel {
+    omp.workshare {
+      // Dynamic alloca (extent %n) for the firstprivate array copy "z"
+      %z = fir.alloca !fir.array<?xi32>, %n {bindc_name = "z", pinned}
+      %shape = fir.shape %n : (index) -> !fir.shape<1>
+      %decl = fir.declare %z(%shape) {uniq_name = "z"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
+      // A side-effecting placeholder op (takes %decl and %src) that initializes the firstprivate copy
+      "test.init"(%decl, %src) : (!fir.ref<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>) -> ()
+      // Workshared loop: loads element %i of the firstprivate array and stores it to %dst
+      %c1 = arith.constant 1 : index
+      omp.workshare.loop_wrapper {
+        omp.loop_nest (%i) : index = (%c1) to (%n) inclusive step (%c1) {  // iterates 1..%n inclusive
+          %elem = fir.array_coor %decl(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+          %val = fir.load %elem : !fir.ref<i32>
+          %dst_elem = fir.array_coor %dst(%shape) %i : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+          fir.store %val to %dst_elem : !fir.ref<i32>
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK: omp.parallel {
+
+// The dynamic alloca is hoisted so each thread gets its own allocation.
+// CHECK: %[[ARRAY:.*]] = fir.alloca !fir.array<?xi32>, %[[N]]
+// CHECK-SAME: bindc_name = "z"
+// CHECK-SAME: pinned
+
+// A box slot is used for copyprivate to broadcast the array data.
+// CHECK: %[[BOX_SLOT:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
+// CHECK: %[[SHAPE0:.*]] = fir.shape %[[N]]
+// CHECK: %[[BOX:.*]] = fir.embox %[[ARRAY]](%[[SHAPE0]]) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK: fir.store %[[BOX]] to %[[BOX_SLOT]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
+
+// Initialization happens only inside the single block.
+// CHECK: omp.single copyprivate(%[[BOX_SLOT]] -> @_workshare_copy_data_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>>) {
+// CHECK: %[[SHAPE1:.*]] = fir.shape %[[N]]
+// CHECK: %[[DECL1:.*]] = fir.declare %[[ARRAY]](%[[SHAPE1]])
+// CHECK: "test.init"(%[[DECL1]], %[[SRC]])
+// CHECK: omp.terminator
+// CHECK: }
+
+// The workshared loop uses the hoisted per-thread array.
+// CHECK: %[[SHAPE2:.*]] = fir.shape %[[N]]
+// CHECK: %[[DECL2:.*]] = fir.declare %[[ARRAY]](%[[SHAPE2]])
+// CHECK: %[[C1:.*]] = arith.constant 1 : index
+// CHECK: omp.wsloop nowait
+// CHECK: omp.loop_nest (%[[I:.*]]) : index = (%[[C1]]) to (%[[N]]) inclusive step (%[[C1]])
+// CHECK: %[[ELEM:.*]] = fir.array_coor %[[DECL2]](%[[SHAPE2]]) %[[I]] : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+// CHECK: %[[VAL:.*]] = fir.load %[[ELEM]] : !fir.ref<i32>
+// CHECK: %[[DST_ELEM:.*]] = fir.array_coor %[[DST]](%[[SHAPE2]]) %[[I]] : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+// CHECK: fir.store %[[VAL]] to %[[DST_ELEM]] : !fir.ref<i32>
+// CHECK: omp.yield
+
+// CHECK: omp.barrier
+// CHECK: omp.terminator
+// CHECK: return

>From f72cc63c615cdbfcd46838625dadc0033556f332 Mon Sep 17 00:00:00 2001
From: Sunil Kuravinakop <kuravina at pe31.hpc.amslabs.hpecorp.net>
Date: Wed, 3 Jun 2026 06:07:15 -0500
Subject: [PATCH 6/6] Use fir::runtime::genAssign instead of an explicit call to
 _FortranAAssign. This helps inlining in builds at higher optimization levels
 (-O1, -O2, etc.).

---
 flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp | 31 +++----------------
 1 file changed, 5 insertions(+), 26 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
index 50463810d5025..7c8a3059329bb 100644
--- a/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
+++ b/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
@@ -18,6 +18,7 @@
 
 #include <flang/Optimizer/Analysis/AliasAnalysis.h>
 #include <flang/Optimizer/Builder/FIRBuilder.h>
+#include <flang/Optimizer/Builder/Runtime/Assign.h>
 #include <flang/Optimizer/Dialect/FIROps.h>
 #include <flang/Optimizer/Dialect/FIRType.h>
 #include <flang/Optimizer/HLFIR/HLFIROps.h>
@@ -318,23 +319,6 @@ static mlir::func::FuncOp createBoxDataCopyFunc(mlir::Location loc,
   if (auto decl = module.lookupSymbol<mlir::func::FuncOp>(copyFuncName))
     return decl;
 
-  // Ensure _FortranAAssign is declared in the module.
-  auto boxNoneTy = fir::BoxType::get(builder.getNoneType());
-  auto refBoxNoneTy = fir::ReferenceType::get(boxNoneTy);
-  auto refI8Ty = fir::ReferenceType::get(builder.getIntegerType(8));
-  auto i32Ty = builder.getI32Type();
-  llvm::StringRef assignFuncName = "_FortranAAssign";
-  auto assignFunc = module.lookupSymbol<mlir::func::FuncOp>(assignFuncName);
-  if (!assignFunc) {
-    mlir::OpBuilder::InsertionGuard g(builder);
-    mlir::OpBuilder modBuilder(module.getBodyRegion());
-    auto assignFuncType = mlir::FunctionType::get(
-        builder.getContext(), {refBoxNoneTy, boxNoneTy, refI8Ty, i32Ty}, {});
-    assignFunc = mlir::func::FuncOp::create(modBuilder, loc, assignFuncName,
-                                            assignFuncType);
-    assignFunc.setVisibility(mlir::SymbolTable::Visibility::Private);
-  }
-
   // Create the copy function.
   mlir::OpBuilder::InsertionGuard guard(builder);
   mlir::OpBuilder modBuilder(module.getBodyRegion());
@@ -348,22 +332,17 @@ static mlir::func::FuncOp createBoxDataCopyFunc(mlir::Location loc,
                       {loc, loc});
   builder.setInsertionPointToStart(&funcOp.getRegion().back());
 
-  // Load the source box.
+  // Load the source box and use the runtime helper to generate the assign.
   Value srcBox =
       fir::LoadOp::create(builder, loc, eleTy, funcOp.getArgument(1));
 
-  // Convert types for _FortranAAssign call.
+  auto boxNoneTy = fir::BoxType::get(builder.getNoneType());
+  auto refBoxNoneTy = fir::ReferenceType::get(boxNoneTy);
   Value dstConv =
       fir::ConvertOp::create(builder, loc, refBoxNoneTy, funcOp.getArgument(0));
   Value srcConv = fir::ConvertOp::create(builder, loc, boxNoneTy, srcBox);
 
-  // Use null source location (only used for error reporting).
-  Value nullLoc = fir::ZeroOp::create(builder, loc, refI8Ty);
-  Value zeroLine = builder.createIntegerConstant(loc, i32Ty, 0);
-
-  // Call _FortranAAssign to copy the array data.
-  fir::CallOp::create(builder, loc, assignFunc,
-                      mlir::ValueRange{dstConv, srcConv, nullLoc, zeroLine});
+  fir::runtime::genAssign(builder, loc, dstConv, srcConv);
 
   mlir::func::ReturnOp::create(builder, loc);
   return funcOp;