[flang-commits] [flang] [mlir] [Flang][OpenMP] Support conditional lastprivate on host (PR #200086)

Wed May 27 16:58:48 PDT 2026

llvmorg-github-actions[bot] wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: Sunil Shrestha (sshrestha-aa)

<details>
<summary>Changes</summary>

This patch lowers `lastprivate(conditional:)` on the host by leveraging the existing user-defined reduction (UDR) infrastructure. A packed struct is created in which each thread tracks, for every conditional lastprivate variable, both the candidate value and the canonical iteration index of its last update. The reduction combiner then selects the value from the sequentially later iteration (for do loops) or from the lexically later section (for sections).

The implementation locates the enclosing omp.parallel and places the shared struct before it so that all threads in the team reduce into the same storage. For orphaned worksharing constructs — where no enclosing parallel is visible at compile time — a module-scope global of the struct type is used instead. This is correct for a single level of parallelism, but concurrent nested teams executing the same orphaned construct would race on the shared global. This limitation mirrors the current Clang behavior, which also uses a single global and does not support nested parallelism for conditional lastprivate.

Assisted-by: Claude Opus 4.6

---

Patch is 65.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/200086.diff


13 Files Affected:

- (modified) flang/lib/Lower/OpenMP/DataSharingProcessor.cpp (+12-3) 
- (modified) flang/lib/Lower/OpenMP/DataSharingProcessor.h (+6) 
- (modified) flang/lib/Lower/OpenMP/OpenMP.cpp (+791-6) 
- (modified) flang/lib/Lower/Support/ReductionProcessor.cpp (+7) 
- (removed) flang/test/Lower/OpenMP/Todo/lastprivate-conditional.f90 (-12) 
- (added) flang/test/Lower/OpenMP/lastprivate-conditional-sections-nowait.f90 (+37) 
- (added) flang/test/Lower/OpenMP/lastprivate-conditional-sections-orphaned.f90 (+76) 
- (added) flang/test/Lower/OpenMP/lastprivate-conditional-sections.f90 (+80) 
- (added) flang/test/Lower/OpenMP/lastprivate-conditional-wsloop-nested-if.f90 (+36) 
- (added) flang/test/Lower/OpenMP/lastprivate-conditional-wsloop-nowait.f90 (+38) 
- (added) flang/test/Lower/OpenMP/lastprivate-conditional-wsloop-orphaned.f90 (+73) 
- (added) flang/test/Lower/OpenMP/lastprivate-conditional-wsloop.f90 (+81) 
- (modified) mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+1-2) 


``````````diff

diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index e392497d30de7..da2b0582e22a3 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -278,10 +278,19 @@ void DataSharingProcessor::collectSymbolsForPrivatization() {
                                  explicitlyPrivatizedSymbols);
     } else if (const auto &lastPrivateClause =
                    std::get_if<omp::clause::Lastprivate>(&clause.u)) {
-      lastprivateModifierNotSupported(*lastPrivateClause,
-                                      converter.getCurrentLocation());
+      auto &modifier = std::get<
+          std::optional<omp::clause::Lastprivate::LastprivateModifier>>(
+          lastPrivateClause->t);
+
       const ObjectList &objects = std::get<ObjectList>(lastPrivateClause->t);
-      collectOmpObjectListSymbol(objects, explicitlyPrivatizedSymbols);
+      if (modifier &&
+          *modifier ==
+              omp::clause::Lastprivate::LastprivateModifier::Conditional) {
+        // conditional lastprivate path
+        collectOmpObjectListSymbol(objects, conditionalLastPrivatizedSymbols);
+      } else {
+        collectOmpObjectListSymbol(objects, explicitlyPrivatizedSymbols);
+      }
     }
   }
 
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
index 5dd564d4bbb61..f889adce0f049 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
@@ -97,6 +97,7 @@ class DataSharingProcessor {
   llvm::SetVector<const semantics::Symbol *> explicitlyPrivatizedSymbols;
   llvm::SetVector<const semantics::Symbol *> defaultSymbols;
   llvm::SetVector<const semantics::Symbol *> allPrivatizedSymbols;
+  llvm::SetVector<const semantics::Symbol *> conditionalLastPrivatizedSymbols;
 
   lower::AbstractConverter &converter;
   semantics::SemanticsContext &semaCtx;
@@ -193,6 +194,11 @@ class DataSharingProcessor {
   void privatizeSymbol(const semantics::Symbol *symToPrivatize,
                        mlir::omp::PrivateClauseOps *clauseOps,
                        std::optional<llvm::omp::Directive> dir = std::nullopt);
+
+  const llvm::SetVector<const semantics::Symbol *> &
+  getConditionalLastprivateSymbols() const {
+    return conditionalLastPrivatizedSymbols;
+  }
 };
 
 } // namespace omp
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 266b06f353675..4f7abaf9d7137 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -52,12 +52,45 @@
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/Support/StateStack.h"
 #include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include <atomic>
 
 using namespace Fortran::lower::omp;
 using namespace Fortran::common::openmp;
 using namespace Fortran::utils::openmp;
 
+// Forward declarations
+static fir::RecordType buildConditionalLpType(
+    Fortran::lower::AbstractConverter &converter,
+    const llvm::SetVector<const Fortran::semantics::Symbol *> &condLpSyms,
+    mlir::Location loc);
+
+static mlir::omp::DeclareReductionOp buildConditionalLastPrivateReduction(
+    Fortran::lower::AbstractConverter &converter, fir::RecordType lpCondType,
+    const llvm::SetVector<const Fortran::semantics::Symbol *> &condLpSyms);
+
+static void rewriteConditionalLpAssignsInWsLoops(
+    Fortran::lower::AbstractConverter &converter, mlir::omp::WsloopOp wsloopOp,
+    fir::RecordType lpType,
+    const llvm::MapVector<mlir::Value, std::string> &condLpOrigAddrs,
+    mlir::Location loc);
+
+static void rewriteConditionalLpAssignsInSections(
+    Fortran::lower::AbstractConverter &converter,
+    mlir::omp::SectionsOp sectionsOp, fir::RecordType lpType,
+    const llvm::MapVector<mlir::Value, std::string> &condLpOrigAddrs,
+    mlir::Location loc);
+
+static void initConditionalLpStruct(fir::FirOpBuilder &builder,
+                                    mlir::Location loc,
+                                    fir::RecordType lpCondType,
+                                    mlir::Value structRef);
+
+static mlir::Value
+getOrCreateConditionalLpGlobal(Fortran::lower::AbstractConverter &converter,
+                               mlir::Location loc, fir::RecordType lpType);
+
 //===----------------------------------------------------------------------===//
 // Code generation helper functions
 //===----------------------------------------------------------------------===//
@@ -415,6 +448,16 @@ static void bindEntryBlockArgs(lower::AbstractConverter &converter,
     llvm::SmallVector<const semantics::Symbol *> processedSyms;
     for (const Object &object : objects) {
       const semantics::Symbol *sym = object.sym();
+      if (!sym) {
+        // Null sentinel: this entry corresponds to a compiler-synthesized
+        // reduction (e.g. the conditional lastprivate struct) that has no
+        // Fortran symbol.  We must keep a placeholder so that processedSyms
+        // stays in lock-step with `vars` and `args` — the later
+        // llvm::zip_equal(processedSyms, vars, args) asserts equal lengths.
+        // The matching block argument is silently skipped below.
+        processedSyms.push_back(nullptr);
+        continue;
+      }
       if (const auto *commonDet =
               sym->detailsIf<semantics::CommonBlockDetails>()) {
         llvm::transform(commonDet->objects(), std::back_inserter(processedSyms),
@@ -424,7 +467,9 @@ static void bindEntryBlockArgs(lower::AbstractConverter &converter,
       }
     }
 
-    for (auto [sym, var, arg] : llvm::zip_equal(processedSyms, vars, args))
+    for (auto [sym, var, arg] : llvm::zip_equal(processedSyms, vars, args)) {
+      if (!sym)
+        continue; // Skip synthetic reduction entries.
       converter.bindSymbol(
           *sym,
           hlfir::translateToExtendedValue(
@@ -432,6 +477,7 @@ static void bindEntryBlockArgs(lower::AbstractConverter &converter,
               /*contiguousHint=*/
               evaluate::IsSimplyContiguous(*sym, converter.getFoldingContext()))
               .first);
+    }
   };
 
   // Process in clause name alphabetical order to match block arguments order.
@@ -2645,6 +2691,26 @@ genScanOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
                                    converter.getCurrentLocation(), clauseOps);
 }
 
+// Forward declaration.
+static void
+emitNestedParallelGuardForCondLp(lower::AbstractConverter &converter,
+                                 mlir::Location loc);
+
+/// Walk up the parent-op chain from the current insertion point and return
+/// the nearest enclosing \c omp::ParallelOp, or \c nullptr if none exists
+/// (i.e. the construct is orphaned).  The walk handles intervening ops such
+/// as \c fir::IfOp that may appear between the worksharing construct and its
+/// enclosing parallel region.
+static mlir::omp::ParallelOp
+findEnclosingParallelOp(fir::FirOpBuilder &builder) {
+  for (auto *op = builder.getInsertionBlock()->getParentOp(); op;
+       op = op->getParentOp()) {
+    if (auto parallelOp = mlir::dyn_cast<mlir::omp::ParallelOp>(op))
+      return parallelOp;
+  }
+  return {};
+}
+
 static mlir::omp::SectionsOp
 genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
               semantics::SemanticsContext &semaCtx,
@@ -2671,13 +2737,58 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
                            /*useDelayedPrivatization=*/false, symTable);
   dsp.processStep1();
 
+  // Detect conditional lastprivate symbols for sections.
+  auto &condLpSyms = dsp.getConditionalLastprivateSymbols();
+  fir::RecordType lpType;
+  mlir::Value lpAlloca;
+  if (!condLpSyms.empty()) {
+    lpType = buildConditionalLpType(converter, condLpSyms, loc);
+    mlir::omp::DeclareReductionOp declRedOp =
+        buildConditionalLastPrivateReduction(converter, lpType, condLpSyms);
+
+    // Create the struct alloca outside the parent parallel (if any).
+    // In the orphaned case (no enclosing ParallelOp), use a
+    // module-scope global so that all threads share one reduction target.
+    auto enclosingParallel = findEnclosingParallelOp(builder);
+    bool isOrphaned = !enclosingParallel;
+
+    // Guard against nested parallelism in the orphaned case.
+    // Emit this BEFORE touching the global to avoid racing on it.
+    if (isOrphaned)
+      emitNestedParallelGuardForCondLp(converter, loc);
+
+    if (!isOrphaned) {
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      builder.setInsertionPoint(enclosingParallel);
+      lpAlloca = builder.createTemporary(loc, lpType);
+      initConditionalLpStruct(builder, loc, lpType, lpAlloca);
+    } else {
+      lpAlloca = getOrCreateConditionalLpGlobal(converter, loc, lpType);
+      // The global is shared across all threads. Use omp.single (which
+      // has an implicit barrier at exit) so that exactly one thread
+      // initialises and all threads wait before entering the construct.
+      mlir::omp::SingleOperands initSingleOps;
+      auto singleOp = mlir::omp::SingleOp::create(builder, loc, initSingleOps);
+      mlir::Block *singleBlock = builder.createBlock(&singleOp.getRegion());
+      builder.setInsertionPointToStart(singleBlock);
+      initConditionalLpStruct(builder, loc, lpType, lpAlloca);
+      mlir::omp::TerminatorOp::create(builder, loc);
+      builder.setInsertionPointAfter(singleOp);
+    }
+
+    clauseOps.reductionVars.push_back(lpAlloca);
+    clauseOps.reductionByref.push_back(true);
+    clauseOps.reductionSyms.push_back(
+        mlir::SymbolRefAttr::get(builder.getContext(), declRedOp.getSymName()));
+    reductionObjects.push_back(Object{{nullptr, std::nullopt}});
+  }
+
   List<Clause> nonDsaClauses;
   List<const clause::Lastprivate *> lastprivates;
 
   for (const Clause &clause : item->clauses) {
     if (clause.id == llvm::omp::Clause::OMPC_lastprivate) {
       auto &lastp = std::get<clause::Lastprivate>(clause.u);
-      lastprivateModifierNotSupported(lastp, converter.getCurrentLocation());
       lastprivates.push_back(&lastp);
     } else {
       switch (clause.id) {
@@ -2732,6 +2843,26 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
         sectionQueue, sectionQueue.begin());
   }
 
+  // Capture original addresses and rewrite conditional LP assigns in sections.
+  llvm::MapVector<mlir::Value, std::string> condLpOrigAddrs;
+  if (!condLpSyms.empty()) {
+    for (const auto *sym : condLpSyms) {
+      mlir::Value addr = converter.getSymbolAddress(*sym);
+      if (addr)
+        condLpOrigAddrs[addr] = sym->name().ToString();
+    }
+    rewriteConditionalLpAssignsInSections(converter, sectionsOp, lpType,
+                                          condLpOrigAddrs, loc);
+  }
+
+  // Collect conditional LP symbol names so we can skip them in the normal
+  // lastprivate copy-back (they are handled by the reduction path).
+  llvm::SmallDenseSet<const semantics::Symbol *> condLpSymSet(
+      condLpSyms.begin(), condLpSyms.end());
+
+  // Track whether any non-conditional lastprivate copy-backs were emitted.
+  bool hasNonCondLastprivate = false;
+
   if (!lastprivates.empty()) {
     mlir::Region &sectionsBody = sectionsOp.getRegion();
     assert(sectionsBody.hasOneBlock());
@@ -2750,6 +2881,10 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
       const auto &objList = std::get<ObjectList>(lastp->t);
       for (const Object &object : objList) {
         semantics::Symbol *sym = object.sym();
+        // Skip conditional LP symbols — handled by the reduction path.
+        if (condLpSymSet.count(sym))
+          continue;
+        hasNonCondLastprivate = true;
         if (const auto *common =
                 sym->detailsIf<semantics::CommonBlockDetails>()) {
           for (const auto &obj : common->objects())
@@ -2764,12 +2899,45 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
   // Perform DataSharingProcessor's step2 out of SECTIONS
   builder.setInsertionPointAfter(sectionsOp.getOperation());
   dsp.processStep2(sectionsOp, false);
-  // Emit implicit barrier to synchronize threads and avoid data
-  // races on post-update of lastprivate variables when `nowait`
-  // clause is present.
-  if (clauseOps.nowait && !lastprivates.empty())
+  // Emit barrier when nowait is present and there are lastprivate copy-backs
+  // (either non-conditional or conditional).  The barrier ensures all threads
+  // have completed their work before lastprivate values are read/copied.
+  //
+  // NOTE: The LLVM OpenMP runtime currently imposes an implicit barrier
+  // inside __kmpc_reduce for tree reductions.  If the runtime were modified
+  // to release losing threads early when nowait is specified, we could use
+  // the return value from the tree reduction (case 1 = winner) to let the
+  // winner thread perform the copy-back without a separate barrier.
+  if (clauseOps.nowait && (hasNonCondLastprivate || !condLpSyms.empty()))
     mlir::omp::BarrierOp::create(builder, loc);
 
+  // Copy-back: copy winning values from the shared reduction struct to the
+  // original variables.  When nowait is absent, the worksharing construct's
+  // implicit end-barrier guarantees all reductions are combined before we
+  // reach this point.  When nowait is present, the barrier above ensures
+  // the reduction is fully finalized before reading the struct.
+  // Wrapped in omp.single so exactly one thread performs the stores.
+  if (!condLpSyms.empty()) {
+    mlir::omp::SingleOperands singleClauseOps;
+    auto singleOp = mlir::omp::SingleOp::create(builder, loc, singleClauseOps);
+    mlir::Block *singleBlock = builder.createBlock(&singleOp.getRegion());
+    builder.setInsertionPointToStart(singleBlock);
+
+    for (auto &[origAddr, symName] : condLpOrigAddrs) {
+      unsigned valFieldIdx = lpType.getFieldIndex(symName);
+      mlir::Type valType = lpType.getType(valFieldIdx);
+
+      fir::IntOrValue valFIdx =
+          mlir::IntegerAttr::get(builder.getI32Type(), valFieldIdx);
+      mlir::Value fieldAddr = fir::CoordinateOp::create(
+          builder, loc, builder.getRefType(valType), lpAlloca,
+          llvm::SmallVector<fir::IntOrValue, 1>{valFIdx});
+      mlir::Value val = fir::LoadOp::create(builder, loc, fieldAddr);
+      fir::StoreOp::create(builder, loc, val, origAddr);
+    }
+    mlir::omp::TerminatorOp::create(builder, loc);
+  }
+
   return sectionsOp;
 }
 
@@ -3361,6 +3529,154 @@ static mlir::omp::DistributeOp genStandaloneDistribute(
   return distributeOp;
 }
 
+/// Zero-initialize the value fields and set index fields to -1 in a
+/// conditional-lastprivate reduction struct.
+///
+/// The struct groups all value fields first, then all index fields:
+///   {val_0, val_1, ..., idx_0, idx_1, ...}
+/// so fields [0, numVars) are value fields and [numVars, 2*numVars) are
+/// the corresponding iteration index fields.
+///
+/// The -1 sentinel on index fields ensures the combiner's "sequentially
+/// last" comparison treats the slot as "no iteration has written yet"
+/// (any real canonical loop IV >= 0 beats -1).
+static void initConditionalLpStruct(fir::FirOpBuilder &builder,
+                                    mlir::Location loc,
+                                    fir::RecordType lpCondType,
+                                    mlir::Value structRef) {
+  auto fields = lpCondType.getTypeList();
+  unsigned numVars = fields.size() / 2;
+  for (unsigned i = 0, e = fields.size(); i < e; ++i) {
+    mlir::Type fieldTy = fields[i].second;
+    fir::IntOrValue idx = mlir::IntegerAttr::get(builder.getI32Type(), i);
+    mlir::Value fieldAddr = fir::CoordinateOp::create(
+        builder, loc, builder.getRefType(fieldTy), structRef,
+        llvm::SmallVector<fir::IntOrValue, 1>{idx});
+    mlir::Value initVal;
+    if (i >= numVars) // index field (second half)
+      initVal = builder.createIntegerConstant(loc, fieldTy, -1);
+    else // value field (first half)
+      initVal = fir::factory::createZeroValue(builder, loc, fieldTy);
+    fir::StoreOp::create(builder, loc, initVal, fieldAddr);
+  }
+}
+
+/// Emit a runtime guard for orphaned conditional-lastprivate worksharing
+/// constructs.  The module-scope global used for the reduction struct is
+/// shared across all teams, so concurrent nested teams would race on it.
+/// Clang has a similar limitation for conditional lastprivate due to its
+/// use of a shared global variable.
+///
+/// Emits:  if (omp_get_level() > 1) ERROR STOP "<message>"
+static void
+emitNestedParallelGuardForCondLp(lower::AbstractConverter &converter,
+                                 mlir::Location loc) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+  mlir::MLIRContext *ctx = builder.getContext();
+  mlir::Type i32Ty = builder.getI32Type();
+
+  // Declare omp_get_level_() -> i32 if not already present.
+  auto funcTy = mlir::FunctionType::get(ctx, {}, {i32Ty});
+  if (!builder.getNamedFunction("omp_get_level_"))
+    builder.createFunction(loc, "omp_get_level_", funcTy);
+
+  mlir::Value level =
+      fir::CallOp::create(builder, loc,
+                          builder.getNamedFunction("omp_get_level_"),
+                          mlir::ValueRange{})
+          .getResult(0);
+  mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
+  mlir::Value isNested = mlir::arith::CmpIOp::create(
+      builder, loc, mlir::arith::CmpIPredicate::sgt, level, one);
+
+  auto ifOp = fir::IfOp::create(builder, loc, /*resultTypes=*/{}, isNested,
+                                /*withElse=*/false);
+  builder.setInsertionPoint(ifOp.getThenRegion().front().getTerminator());
+
+  // Build a global string constant for the error message.
+  llvm::StringRef msg =
+      "orphaned worksharing construct with lastprivate(conditional:) "
+      "is not supported in nested parallelism";
+  std::string globalName = "_lp_cond_nested_msg";
+  size_t msgLen = msg.size();
+  auto charTy = fir::CharacterType::get(ctx, 1, msgLen);
+  if (!builder.getNamedGlobal(globalName)) {
+    fir::GlobalOp global = builder.createGlobal(
+        loc, charTy, globalName, builder.createInternalLinkage(),
+        /*value=*/mlir::Attribute{}, /*isConst=*/true);
+    mlir::Region &region = global.getRegion();
+    mlir::Block *block = builder.createBlock(&region);
+    builder.setInsertionPointToStart(block);
+    mlir::Value val = fir::StringLitOp::create(builder, loc, charTy, msg);
+    fir::HasValueOp::create(builder, loc, val);
+    builder.setInsertionPoint(ifOp.getThenRegion().front().getTerminator());
+  }
+
+  // Declare _FortranAStopStatementText if not already present.
+  mlir::Type i64Ty = builder.getI64Type();
+  mlir::Type i1Ty = builder.getI1Type();
+  mlir::Type ptrTy = builder.getRefType(builder.getIntegerType(8));
+  auto stopTy = mlir::FunctionType::get(ctx, {ptrTy, i64Ty, i1Ty, i1Ty}, {});
+  if (!builder.getNamedFunction("_FortranAStopStatementText"))
+    builder.createFunction(loc, "_FortranAStopStatementText", stopTy);
+
+  mlir::Value msgAddr =
+      fir::AddrOfOp::create(builder, loc, builder.getRefType(charTy),
+                            builder.getSymbolRefAttr(globalName));
+  mlir::Value msgPtr = builder.createConvert(loc, ptrTy, msgAddr);
+  mlir::Value len = builder.createIntegerConstant(loc, i64Ty, msgLen);
+  mlir::Value trueVal = builder.createIntegerConstant(loc, i1Ty, 1);
+  mlir::Value falseVal = builder.createIntegerConstant(loc, i1Ty, 0);
+  fir::CallOp::create(builder, loc,
+                      builder.getNamedFunction("_FortranAStopStatementText"),
+                      mlir::ValueRange{msgPtr, len, trueVal, falseVal});
+
+  builder.setInsertionPointAfter(ifOp);
+}
+
+/// Return the address of a module-scope global for the conditional-lastprivate
+/// reduction struct.  This is used in the *orphaned* worksharing case (sections
+/// or wsloop inside a subroutine called from a parallel region) where the
+/// parent op is a FuncOp, not a ParallelOp.
+///
+/// Because there is no enclosing omp.parallel in the same function, a stack
+/// alloca would give every thread its own private copy and the cross-thread
+/// reduction combine would never merge results.  A global provides a single
+/// shared address that all th...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/200086