[flang-commits] [flang] [mlir] [Flang][OpenMP] Implement workdistribute construct lowering (PR #140523)

Ivan R. Ivanov via flang-commits flang-commits at lists.llvm.org
Thu Oct 9 02:00:35 PDT 2025


================
@@ -0,0 +1,1817 @@
+//===- LowerWorkdistribute.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering and optimisations of omp.workdistribute.
+//
+// Fortran array statements are lowered to FIR as unordered fir.do_loop ops.
+// The lower-workdistribute pass identifies an unordered fir.do_loop nested
+// in target{teams{workdistribute{...}}} and lowers it to
+// target{teams{parallel{distribute{wsloop{loop_nest}}}}}, hoisting all
+// other ops out of the target region. Heap allocations on the target are
+// replaced with omp.target_allocmem and deallocations with
+// omp.target_freemem, issued from the host. Calls to the runtime function
+// "Assign" are replaced with omp_target_memcpy.
+//
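+// As a rough sketch (operand and clause details elided), the core rewrite is:
+//
+//   omp.target {
+//     omp.teams {
+//       omp.workdistribute {
+//         fir.do_loop %i = %lb to %ub step %step unordered { ... }
+//       }
+//     }
+//   }
+//
+// becomes
+//
+//   omp.target {
+//     omp.teams {
+//       omp.parallel {
+//         omp.distribute {
+//           omp.wsloop {
+//             omp.loop_nest (%i) : index = (%lb) to (%ub) step (%step) { ... }
+//           }
+//         }
+//       }
+//     }
+//   }
+//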
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/HLFIR/Passes.h"
+#include "flang/Optimizer/OpenMP/Utils.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
+#include <mlir/Dialect/Utils/IndexingUtils.h>
+#include <mlir/IR/BlockSupport.h>
+#include <mlir/IR/BuiltinOps.h>
+#include <mlir/IR/Diagnostics.h>
+#include <mlir/IR/IRMapping.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/Interfaces/SideEffectInterfaces.h>
+#include <mlir/Support/LLVM.h>
+#include <optional>
+#include <variant>
+
+namespace flangomp {
+#define GEN_PASS_DEF_LOWERWORKDISTRIBUTE
+#include "flang/Optimizer/OpenMP/Passes.h.inc"
+} // namespace flangomp
+
+#define DEBUG_TYPE "lower-workdistribute"
+
+using namespace mlir;
+
+namespace {
+
+// isRuntimeCall returns true if the given operation is a call to a Fortran
+// runtime function, i.e. one whose callee carries the fir.runtime attribute.
+static bool isRuntimeCall(Operation *op) {
+  if (auto callOp = dyn_cast<fir::CallOp>(op)) {
+    auto callee = callOp.getCallee();
+    if (!callee)
+      return false;
+    auto *func = op->getParentOfType<ModuleOp>().lookupSymbol(*callee);
+    if (func->getAttr(fir::FIROpsDialect::getFirRuntimeAttrName()))
+      return true;
+  }
+  return false;
+}
+
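+// For example (a sketch; the exact runtime signature is elided), a call
+// recognized by isRuntimeCall looks like:
+//
+//   func.func private @_FortranAAssign(...) attributes {fir.runtime}
+//   ...
+//   fir.call @_FortranAAssign(%dst, %src, ...) : (...) -> ()
+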
+// This is the single source of truth about whether we should parallelize an
+// operation nested in an omp.workdistribute region.
+static bool shouldParallelize(Operation *op) {
+  // True if the op is a runtime call to Assign
+  if (isRuntimeCall(op)) {
+    fir::CallOp runtimeCall = cast<fir::CallOp>(op);
+    if ((*runtimeCall.getCallee()).getRootReference().getValue() ==
+        "_FortranAAssign") {
+      return true;
+    }
+  }
+  // We cannot parallelize ops whose results are used elsewhere:
+  // parallelizable operations should not produce values that
+  // other operations depend on.
+  if (llvm::any_of(op->getResults(),
+                   [](OpResult v) -> bool { return !v.use_empty(); }))
+    return false;
+  // We will parallelize unordered loops - these come from array syntax
+  if (auto loop = dyn_cast<fir::DoLoopOp>(op)) {
+    auto unordered = loop.getUnordered();
+    if (!unordered)
+      return false;
+    return *unordered;
+  }
+  // We cannot parallelize anything else.
+  return false;
+}
+
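+// For example, the Fortran array statement `a = b + c` lowers to an
+// unordered loop (sketch, body elided) that shouldParallelize accepts:
+//
+//   fir.do_loop %i = %c1 to %n step %c1 unordered {
+//     // load b(i) and c(i), add, store to a(i)
+//   }
+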
+// getPerfectlyNested returns the op of type T "perfectly nested" in op,
+// i.e. the only operation (besides the terminator) in the single block
+// of op's single region; it returns nullptr otherwise.
+template <typename T>
+static T getPerfectlyNested(Operation *op) {
+  if (op->getNumRegions() != 1)
+    return nullptr;
+  auto &region = op->getRegion(0);
+  if (region.getBlocks().size() != 1)
+    return nullptr;
+  auto *block = &region.front();
+  auto *firstOp = &block->front();
+  if (auto nested = dyn_cast<T>(firstOp))
+    if (firstOp->getNextNode() == block->getTerminator())
+      return nested;
+  return nullptr;
+}
+
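+// For example (sketch), getPerfectlyNested<omp::WorkdistributeOp>(teamsOp)
+// returns the workdistribute op below, since it is the only op before the
+// terminator of the single block of the single region:
+//
+//   omp.teams {
+//     omp.workdistribute { ... }
+//     omp.terminator
+//   }
+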
+// VerifyTargetTeamsWorkdistribute verifies that
+// omp.target { teams { workdistribute { ... } } } is well formed
+// and fails for function calls whose lowering is not implemented yet.
+static bool
+VerifyTargetTeamsWorkdistribute(omp::WorkdistributeOp workdistribute) {
+  auto teams = dyn_cast<omp::TeamsOp>(workdistribute->getParentOp());
+  if (!teams) {
+    workdistribute.emitError() << "workdistribute not nested in teams\n";
+    return false;
+  }
+  if (workdistribute.getRegion().getBlocks().size() != 1) {
+    workdistribute.emitError() << "workdistribute with multiple blocks\n";
+    return false;
+  }
+  if (teams.getRegion().getBlocks().size() != 1) {
+    workdistribute.emitError() << "teams with multiple blocks\n";
+    return false;
+  }
+  omp::TargetOp targetOp = dyn_cast<omp::TargetOp>(teams->getParentOp());
+  // If teams is not nested in omp.target, the checks below do not apply.
+  if (!targetOp)
+    return true;
+
+  for (auto &op : workdistribute.getOps()) {
+    if (auto callOp = dyn_cast<fir::CallOp>(op)) {
+      if (!isRuntimeCall(&op)) {
+        workdistribute.emitError() << "Non-runtime fir.call lowering not "
+                                      "supported in workdistribute yet.";
+        return false;
+      }
+      auto funcName = (*callOp.getCallee()).getRootReference().getValue();
+      // _FortranAAssign is handled. Other runtime calls are not supported
+      // in omp.workdistribute yet.
+      if (funcName == "_FortranAAssign")
+        continue;
+      workdistribute.emitError()
+          << "Runtime call " << funcName
+          << " lowering not supported for workdistribute yet.";
+      return false;
+    }
+  }
+  return true;
+}
+
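+// For example (sketch; @user_func is a hypothetical symbol), verification
+// fails on a plain, non-runtime fir.call inside workdistribute:
+//
+//   omp.target {
+//     omp.teams {
+//       omp.workdistribute {
+//         fir.call @user_func() : () -> ()
+//       }
+//     }
+//   }
+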
+// FissionWorkdistribute finds the parallelizable ops
+// within a teams {workdistribute} region and moves each of them
+// into its own teams {workdistribute} region.
+//
+// If B() and D() are parallelizable,
+//
+// omp.teams {
+//   omp.workdistribute {
+//     A()
+//     B()
+//     C()
+//     D()
+//     E()
+//   }
+// }
+//
+// becomes
+//
+// A()
+// omp.teams {
+//   omp.workdistribute {
+//     B()
+//   }
+// }
+// C()
+// omp.teams {
+//   omp.workdistribute {
+//     D()
+//   }
+// }
+// E()
----------------
ivanradanov wrote:

It is necessary for correctness because there is no target-wide barrier construct in OpenMP (nor on every GPU we want to target), and the only way to achieve one is (I think) to launch separate kernels.


https://github.com/llvm/llvm-project/pull/140523

