[flang-commits] [flang] [WIP][flang][OpenMP] Experimental pass to map `do concurrent` to OMP (PR #77285)
Kareem Ergawy via flang-commits
flang-commits at lists.llvm.org
Fri Feb 16 01:32:23 PST 2024
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/77285
>From 44195146b1fa01952078766a2262c848ee23f27c Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Wed, 27 Dec 2023 07:25:46 -0600
Subject: [PATCH 1/2] [WIP][flang][OpenMP] Experimental pass to map `do
concurrent` to OMP
Adds a pass to map `do concurrent` to OpenMP worksharing constructs. For
now, only basic loops are mapped to `omp parallel do`. This is still a WIP
with more work needed for testing and for mapping more advanced loops.
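As a rough illustration (not part of the patch; the array size and assignment
mirror the basic.mlir test added below), the source-level mapping the pass aims
for is:

  integer :: i
  integer :: a(10)

  do concurrent (i = 1:10)
    a(i) = i
  end do

which, after lowering to FIR, is rewritten as if the loop had been written as:

  !$omp parallel do
  do i = 1, 10
    a(i) = i
  end do
  !$omp end parallel do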
---
.../include/flang/Optimizer/HLFIR/HLFIROps.td | 2 +-
.../flang/Optimizer/Transforms/Passes.h | 2 +
.../flang/Optimizer/Transforms/Passes.td | 20 +++
flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 +
.../Transforms/DoConcurrentConversion.cpp | 162 ++++++++++++++++++
flang/test/Transforms/DoConcurrent/basic.mlir | 60 +++++++
6 files changed, 246 insertions(+), 1 deletion(-)
create mode 100644 flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
create mode 100644 flang/test/Transforms/DoConcurrent/basic.mlir
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index c82eae154d31a1..4aee66f83b4931 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -75,7 +75,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments,
func.func @foo(%arg0: !fir.ref<!fir.array<?x?x!fir.char<1,?>>>, %arg1: !fir.ref<i64>) {
%c10 = arith.constant 10 : index
%c20 = arith.constant 20 : index
- %1 = fir.load %ag1 : fir.ref<i64>
+ %1 = fir.load %arg1 : fir.ref<i64>
%2 = fir.shape_shift %c10, %1, %c20, %1 : (index, index, index, index) -> !fir.shapeshift<2>
%3 = hfir.declare %arg0(%2) typeparams %1 {uniq_name = "c"} (fir.ref<!fir.array<?x?x!fir.char<1,?>>>, fir.shapeshift<2>, index) -> (fir.box<!fir.array<?x?x!fir.char<1,?>>>, fir.ref<!fir.array<?x?x!fir.char<1,?>>>)
// ... uses %3#0 as "c"
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index e1d22c8c986da7..25a526ab0cbfcb 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -96,6 +96,8 @@ createFunctionAttrPass(FunctionAttrTypes &functionAttr, bool noInfsFPMath,
bool noNaNsFPMath, bool approxFuncFPMath,
bool noSignedZerosFPMath, bool unsafeFPMath);
+std::unique_ptr<mlir::Pass> createDoConcurrentConversionPass();
+
// declarative passes
#define GEN_PASS_REGISTRATION
#include "flang/Optimizer/Transforms/Passes.h.inc"
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 5fb576fd876254..06de4a1d28a929 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -397,4 +397,24 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> {
let constructor = "::fir::createFunctionAttrPass()";
}
+def DoConcurrentConversionPass : Pass<"fopenmp-do-concurrent-conversion", "mlir::func::FuncOp"> {
+ let summary = "Map `DO CONCURRENT` loops to OpenMP worksharing loops.";
+
+  let description = [{ This is an experimental pass to map `DO CONCURRENT` loops
+    to their corresponding OpenMP worksharing constructs.
+
+ For now the following is supported:
+ - Mapping simple loops to `parallel do`.
+
+    Still TODO:
+ - More extensive testing.
+ - Mapping to `target teams distribute parallel do`.
+    - Allowing the user to control the mapping behavior: either to the host
+      or to the target.
+ }];
+
+ let constructor = "::fir::createDoConcurrentConversionPass()";
+ let dependentDialects = ["mlir::omp::OpenMPDialect"];
+}
+
#endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index ba2e267996150e..cf83bb496bb5e8 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -22,6 +22,7 @@ add_flang_library(FIRTransforms
OMPMarkDeclareTarget.cpp
VScaleAttr.cpp
FunctionAttr.cpp
+ DoConcurrentConversion.cpp
DEPENDS
FIRDialect
diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
new file mode 100644
index 00000000000000..180c0bdf672af9
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
@@ -0,0 +1,162 @@
+//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Dialect/Support/FIRContext.h"
+#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+#include <memory>
+
+namespace fir {
+#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "fopenmp-do-concurrent-conversion"
+
+namespace {
+class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
+public:
+ using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;
+
+ mlir::LogicalResult
+ matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const override {
+ mlir::OpPrintingFlags flags;
+ flags.printGenericOpForm();
+
+ mlir::omp::ParallelOp parallelOp =
+ rewriter.create<mlir::omp::ParallelOp>(doLoop.getLoc());
+
+ rewriter.createBlock(¶llelOp.getRegion());
+ mlir::Block &block = parallelOp.getRegion().back();
+
+ rewriter.setInsertionPointToEnd(&block);
+ rewriter.create<mlir::omp::TerminatorOp>(doLoop.getLoc());
+
+ rewriter.setInsertionPointToStart(&block);
+
+ // Clone the LB, UB, step defining ops inside the parallel region.
+ llvm::SmallVector<mlir::Value> lowerBound, upperBound, step;
+ lowerBound.push_back(
+ rewriter.clone(*doLoop.getLowerBound().getDefiningOp())->getResult(0));
+ upperBound.push_back(
+ rewriter.clone(*doLoop.getUpperBound().getDefiningOp())->getResult(0));
+ step.push_back(
+ rewriter.clone(*doLoop.getStep().getDefiningOp())->getResult(0));
+
+ auto wsLoopOp = rewriter.create<mlir::omp::WsLoopOp>(
+ doLoop.getLoc(), lowerBound, upperBound, step);
+ wsLoopOp.setInclusive(true);
+
+ auto outlineableOp =
+ mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(*parallelOp);
+ assert(outlineableOp);
+ rewriter.setInsertionPointToStart(outlineableOp.getAllocaBlock());
+
+    // For the induction variable, we need to privatize its allocation and
+    // binding inside the parallel region.
+ llvm::SmallSetVector<mlir::Operation *, 2> workList;
+    // Therefore, we first locate the induction variable by finding the
+    // `fir.store` ops whose stored value is the loop's block argument.
+ workList.insert(doLoop.getInductionVar().getUsers().begin(),
+ doLoop.getInductionVar().getUsers().end());
+ llvm::SmallSetVector<fir::StoreOp, 2> inductionVarTargetStores;
+
+    // Walk the use-chain of the loop's block argument until we hit a `fir.store`.
+ while (!workList.empty()) {
+ mlir::Operation *item = workList.front();
+
+ if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(item)) {
+ inductionVarTargetStores.insert(storeOp);
+ } else {
+ workList.insert(item->getUsers().begin(), item->getUsers().end());
+ }
+
+ workList.remove(item);
+ }
+
+    // For each collected `fir.store`, find the target memref's alloca and
+    // declare ops.
+ llvm::SmallSetVector<mlir::Operation *, 4> declareAndAllocasToClone;
+ for (auto storeOp : inductionVarTargetStores) {
+ mlir::Operation *storeTarget = storeOp.getMemref().getDefiningOp();
+
+ for (auto operand : storeTarget->getOperands()) {
+ declareAndAllocasToClone.insert(operand.getDefiningOp());
+ }
+ declareAndAllocasToClone.insert(storeTarget);
+ }
+
+ mlir::IRMapping mapper;
+
+    // Clone the memref defining ops into the parallel region.
+ for (mlir::Operation *opToClone : declareAndAllocasToClone) {
+ rewriter.clone(*opToClone, mapper);
+ }
+
+ // Clone the loop's body inside the worksharing construct using the mapped
+ // memref values.
+ rewriter.cloneRegionBefore(doLoop.getRegion(), wsLoopOp.getRegion(),
+ wsLoopOp.getRegion().begin(), mapper);
+
+ mlir::Operation *terminator = wsLoopOp.getRegion().back().getTerminator();
+ rewriter.setInsertionPointToEnd(&wsLoopOp.getRegion().back());
+ rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
+ rewriter.eraseOp(terminator);
+
+ rewriter.eraseOp(doLoop);
+
+ return mlir::success();
+ }
+};
+
+class DoConcurrentConversionPass
+ : public fir::impl::DoConcurrentConversionPassBase<
+ DoConcurrentConversionPass> {
+public:
+ void runOnOperation() override {
+ mlir::func::FuncOp func = getOperation();
+
+ if (func.isDeclaration()) {
+ return;
+ }
+
+ auto *context = &getContext();
+ mlir::RewritePatternSet patterns(context);
+ patterns.insert<DoConcurrentConversion>(context);
+ mlir::ConversionTarget target(*context);
+ target.addLegalDialect<fir::FIROpsDialect, hlfir::hlfirDialect,
+ mlir::arith::ArithDialect, mlir::func::FuncDialect,
+ mlir::omp::OpenMPDialect>();
+
+ target.addDynamicallyLegalOp<fir::DoLoopOp>(
+ [](fir::DoLoopOp op) { return !op.getUnordered(); });
+
+ if (mlir::failed(mlir::applyFullConversion(getOperation(), target,
+ std::move(patterns)))) {
+ mlir::emitError(mlir::UnknownLoc::get(context),
+ "error in converting do-concurrent op");
+ signalPassFailure();
+ }
+ }
+};
+} // namespace
+
+std::unique_ptr<mlir::Pass> fir::createDoConcurrentConversionPass() {
+ return std::make_unique<DoConcurrentConversionPass>();
+}
diff --git a/flang/test/Transforms/DoConcurrent/basic.mlir b/flang/test/Transforms/DoConcurrent/basic.mlir
new file mode 100644
index 00000000000000..7d62463f36d422
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/basic.mlir
@@ -0,0 +1,60 @@
+// Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`.
+
+// RUN: fir-opt --fopenmp-do-concurrent-conversion %s | FileCheck %s
+
+// CHECK-LABEL: func.func @do_concurrent_basic
+func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} {
+ // CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+ // CHECK: %[[C1:.*]] = arith.constant 1 : i32
+ // CHECK: %[[C10:.*]] = arith.constant 10 : i32
+
+ %0 = fir.alloca i32 {bindc_name = "i"}
+ %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+ %2 = fir.address_of(@_QFEa) : !fir.ref<!fir.array<10xi32>>
+ %c10 = arith.constant 10 : index
+ %3 = fir.shape %c10 : (index) -> !fir.shape<1>
+ %4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+ %c1_i32 = arith.constant 1 : i32
+ %7 = fir.convert %c1_i32 : (i32) -> index
+ %c10_i32 = arith.constant 10 : i32
+ %8 = fir.convert %c10_i32 : (i32) -> index
+ %c1 = arith.constant 1 : index
+
+ // CHECK-NOT: fir.do_loop
+
+ // CHECK: omp.parallel {
+
+ // CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"}
+ // CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+ // CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index
+ // CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index
+ // CHECK: %[[STEP:.*]] = arith.constant 1 : index
+
+ // CHECK: omp.wsloop for (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+ // CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32
+ // CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref<i32>
+ // CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+ // CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref<i32>
+ // CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64
+ // CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]]) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+ // CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref<i32>
+ // CHECK-NEXT: omp.yield
+ // CHECK-NEXT: }
+
+ // CHECK-NEXT: omp.terminator
+ // CHECK-NEXT: }
+ fir.do_loop %arg0 = %7 to %8 step %c1 unordered {
+ %13 = fir.convert %arg0 : (index) -> i32
+ fir.store %13 to %1#1 : !fir.ref<i32>
+ %14 = fir.load %1#0 : !fir.ref<i32>
+ %15 = fir.load %1#0 : !fir.ref<i32>
+ %16 = fir.convert %15 : (i32) -> i64
+ %17 = hlfir.designate %4#0 (%16) : (!fir.ref<!fir.array<10xi32>>, i64) -> !fir.ref<i32>
+ hlfir.assign %14 to %17 : i32, !fir.ref<i32>
+ }
+
+ // CHECK-NOT: fir.do_loop
+
+ return
+ }
>From 07d6c48e6d610e1c73aeb8f7f1c7dafbb3d5d3f0 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Fri, 16 Feb 2024 03:30:58 -0600
Subject: [PATCH 2/2] Details on some of the analyses we might need.
This does not detail everything yet; still working through it.
---
.../Transforms/DoConcurrentConversion.cpp | 53 +++++++++++++++++--
1 file changed, 48 insertions(+), 5 deletions(-)
diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
index 180c0bdf672af9..e7b223aec8ea2a 100644
--- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
@@ -42,13 +42,32 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
mlir::omp::ParallelOp parallelOp =
rewriter.create<mlir::omp::ParallelOp>(doLoop.getLoc());
- rewriter.createBlock(¶llelOp.getRegion());
- mlir::Block &block = parallelOp.getRegion().back();
+ mlir::Block *block = rewriter.createBlock(¶llelOp.getRegion());
- rewriter.setInsertionPointToEnd(&block);
+ rewriter.setInsertionPointToEnd(block);
rewriter.create<mlir::omp::TerminatorOp>(doLoop.getLoc());
- rewriter.setInsertionPointToStart(&block);
+ rewriter.setInsertionPointToStart(block);
+
+ // ==== TODO (1) Start ====
+ //
+ // The goal of the few lines below is to collect and clone
+ // the list of operations that define the loop's lower and upper bounds as
+ // well as the step. Should we, instead of doing this here, split it into 2
+ // stages?
+ //
+ // 1. **Stage 1**: add an analysis that extracts all the relevant
+ // operations defining the lower-bound, upper-bound, and
+ // step.
+ // 2. **Stage 2**: clone the collected operations in the parallel region.
+ //
+    // So far, the pass has been tested with very simple loops (where the bounds
+    // and step are constants), so the goal of **Stage 1** is to have a
+    // well-defined component whose sole responsibility is collecting all the
+    // ops relevant to the loop header. This way we can test it in isolation
+    // for more complex loops and better organize the code. **Stage 2** would
+    // then be responsible for the actual cloning of the collected loop-header
+    // preparation/allocation operations.
// Clone the LB, UB, step defining ops inside the parallel region.
llvm::SmallVector<mlir::Value> lowerBound, upperBound, step;
@@ -58,6 +77,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
rewriter.clone(*doLoop.getUpperBound().getDefiningOp())->getResult(0));
step.push_back(
rewriter.clone(*doLoop.getStep().getDefiningOp())->getResult(0));
+ // ==== TODO (1) End ====
auto wsLoopOp = rewriter.create<mlir::omp::WsLoopOp>(
doLoop.getLoc(), lowerBound, upperBound, step);
@@ -65,9 +85,26 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
auto outlineableOp =
mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(*parallelOp);
- assert(outlineableOp);
rewriter.setInsertionPointToStart(outlineableOp.getAllocaBlock());
+ // ==== TODO (2) Start ====
+ //
+    // The goal of the simple work-list algorithm below and the `for` loop
+    // that follows it is to collect all the operations related to the
+    // allocation of the induction variable of the `do concurrent` loop. The
+    // operations collected by this algorithm are very similar to what is
+    // usually emitted for privatized variables, e.g. for omp.parallel loops.
+    // Therefore, I think we can:
+ //
+    // 1. **Stage 1**: Add an analysis that collects all these operations. The
+    //                 goal is similar to **Stage 1** of TODO (1): isolate the
+    //                 algorithm in an individually-testable component so that
+    //                 we can properly implement and test it for more complicated
+    //                 `do concurrent` loops.
+    // 2. **Stage 2**: Using the collected operations, create and populate an
+    //                 `omp.private {type=private}` op to serve as the
+    //                 delayed privatizer for the new work-sharing loop.
+
    // For the induction variable, we need to privatize its allocation and
    // binding inside the parallel region.
llvm::SmallSetVector<mlir::Operation *, 2> workList;
@@ -101,6 +138,12 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
}
declareAndAllocasToClone.insert(storeTarget);
}
+ // ==== TODO (2) End ====
+ //
+    // TODO (1 & 2): By isolating the analyses proposed in both TODOs, I think
+    // we can more easily generalize the pass to work for targets other than
+    // OpenMP, e.g. OpenACC: we can reuse the results of the analyses and only
+    // change the code-gen/rewriting.
mlir::IRMapping mapper;