[flang-commits] [flang] [mlir] [Flang][OpenMP] Implement workdistribute construct lowering (PR #140523)
Ivan R. Ivanov via flang-commits
flang-commits at lists.llvm.org
Thu Oct 9 02:00:35 PDT 2025
================
@@ -0,0 +1,1817 @@
+//===- LowerWorkdistribute.cpp
+//-------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering and optimisations of omp.workdistribute.
+//
+// Fortran array statements are lowered to fir as fir.do_loop unordered.
+// The lower-workdistribute pass works mainly by identifying fir.do_loop
+// unordered ops that are nested in
+// target{teams{workdistribute{fir.do_loop unordered}}} and
+// lowering them to target{teams{parallel{distribute{wsloop{loop_nest}}}}}.
+// It hoists all the other ops outside target region.
+// Replaces heap allocation on target with omp.target_allocmem and
+// deallocation with omp.target_freemem from host. Also replaces
+// runtime function "Assign" with omp_target_memcpy.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/HLFIR/Passes.h"
+#include "flang/Optimizer/OpenMP/Utils.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include <mlir/Dialect/Arith/IR/Arith.h>
+#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
+#include <mlir/Dialect/Utils/IndexingUtils.h>
+#include <mlir/IR/BlockSupport.h>
+#include <mlir/IR/BuiltinOps.h>
+#include <mlir/IR/Diagnostics.h>
+#include <mlir/IR/IRMapping.h>
+#include <mlir/IR/PatternMatch.h>
+#include <mlir/Interfaces/SideEffectInterfaces.h>
+#include <mlir/Support/LLVM.h>
+#include <optional>
+#include <variant>
+
+namespace flangomp {
+#define GEN_PASS_DEF_LOWERWORKDISTRIBUTE
+#include "flang/Optimizer/OpenMP/Passes.h.inc"
+} // namespace flangomp
+
+#define DEBUG_TYPE "lower-workdistribute"
+
+using namespace mlir;
+
+namespace {
+
+// The isRuntimeCall function is a utility designed to determine
+// if a given operation is a call to a Fortran-specific runtime function.
+static bool isRuntimeCall(Operation *op) {
+ if (auto callOp = dyn_cast<fir::CallOp>(op)) {
+ auto callee = callOp.getCallee();
+ if (!callee)
+ return false;
+ auto *func = op->getParentOfType<ModuleOp>().lookupSymbol(*callee);
+ if (func->getAttr(fir::FIROpsDialect::getFirRuntimeAttrName()))
+ return true;
+ }
+ return false;
+}
+
+// This is the single source of truth about whether we should parallelize an
+// operation nested in an omp.workdistribute region.
+static bool shouldParallelize(Operation *op) {
+ // True if the op is a runtime call to Assign
+ if (isRuntimeCall(op)) {
+ fir::CallOp runtimeCall = cast<fir::CallOp>(op);
+ if ((*runtimeCall.getCallee()).getRootReference().getValue() ==
+ "_FortranAAssign") {
+ return true;
+ }
+ }
+ // We cannot parallelize ops with side effects.
+ // Parallelizable operations should not produce
+ // values that other operations depend on
+ if (llvm::any_of(op->getResults(),
+ [](OpResult v) -> bool { return !v.use_empty(); }))
+ return false;
+ // We will parallelize unordered loops - these come from array syntax
+ if (auto loop = dyn_cast<fir::DoLoopOp>(op)) {
+ auto unordered = loop.getUnordered();
+ if (!unordered)
+ return false;
+ return *unordered;
+ }
+  // We cannot parallelize anything else.
+ return false;
+}
+
+// The getPerfectlyNested function is a generic utility for finding
+// a single, "perfectly nested" operation within a parent operation.
+template <typename T>
+static T getPerfectlyNested(Operation *op) {
+ if (op->getNumRegions() != 1)
+ return nullptr;
+ auto ®ion = op->getRegion(0);
+ if (region.getBlocks().size() != 1)
+ return nullptr;
+ auto *block = ®ion.front();
+ auto *firstOp = &block->front();
+ if (auto nested = dyn_cast<T>(firstOp))
+ if (firstOp->getNextNode() == block->getTerminator())
+ return nested;
+ return nullptr;
+}
+
+// VerifyTargetTeamsWorkdistribute method verifies that
+// omp.target { teams { workdistribute { ... } } } is well formed
+// and fails for function calls that don't have lowering implemented yet.
+static bool
+VerifyTargetTeamsWorkdistribute(omp::WorkdistributeOp workdistribute) {
+ OpBuilder rewriter(workdistribute);
+ auto teams = dyn_cast<omp::TeamsOp>(workdistribute->getParentOp());
+ if (!teams) {
+ workdistribute.emitError() << "workdistribute not nested in teams\n";
+ return false;
+ }
+ if (workdistribute.getRegion().getBlocks().size() != 1) {
+ workdistribute.emitError() << "workdistribute with multiple blocks\n";
+ return false;
+ }
+ if (teams.getRegion().getBlocks().size() != 1) {
+ workdistribute.emitError() << "teams with multiple blocks\n";
+ return false;
+ }
+ omp::TargetOp targetOp = dyn_cast<omp::TargetOp>(teams->getParentOp());
+ // return if not omp.target
+ if (!targetOp)
+ return true;
+
+ for (auto &op : workdistribute.getOps()) {
+ if (auto callOp = dyn_cast<fir::CallOp>(op)) {
+ if (isRuntimeCall(&op)) {
+ auto funcName = (*callOp.getCallee()).getRootReference().getValue();
+ // _FortranAAssign is handled. Other runtime calls are not supported
+ // in omp.workdistribute yet.
+ if (funcName == "_FortranAAssign")
+ continue;
+ else
+ workdistribute.emitError()
+ << "Runtime call " << funcName
+ << " lowering not supported for workdistribute yet.";
+ return false;
+ } else {
+ workdistribute.emitError() << "Non-runtime fir.call lowering not "
+ "supported in workdistribute yet.";
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+// FissionWorkdistribute method finds the parallelizable ops
+// within teams {workdistribute} region and moves them to their
+// own teams{workdistribute} region.
+//
+// If B() and D() are parallelizable,
+//
+// omp.teams {
+// omp.workdistribute {
+// A()
+// B()
+// C()
+// D()
+// E()
+// }
+// }
+//
+// becomes
+//
+// A()
+// omp.teams {
+// omp.workdistribute {
+// B()
+// }
+// }
+// C()
+// omp.teams {
+// omp.workdistribute {
+// D()
+// }
+// }
+// E()
----------------
ivanradanov wrote:
It is necessary for correctness because there is no target-wide barrier construct in OpenMP (or on every GPU we want to target) and the only way to achieve that is (I think) to launch separate kernels.
https://github.com/llvm/llvm-project/pull/140523
More information about the flang-commits
mailing list