[flang] [llvm] [mlir] [openmp] [MLIR][OpenMP] Add omp.fuse operation (PR #168898)
Ferran Toda via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 24 04:51:10 PST 2025
https://github.com/NouTimbaler updated https://github.com/llvm/llvm-project/pull/168898
>From b24a7a72598d303a2e3a2ca4f61a7b1a0a744fa4 Mon Sep 17 00:00:00 2001
From: Ferran Toda <ferran.todacasaban at bsc.es>
Date: Thu, 20 Nov 2025 02:45:10 +0000
Subject: [PATCH] lower loop fuse
---
flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 1 +
flang/lib/Lower/OpenMP/Clauses.cpp | 5 +-
.../lib/Lower/OpenMP/DataSharingProcessor.cpp | 3 +-
flang/lib/Lower/OpenMP/OpenMP.cpp | 77 ++++++++--
flang/lib/Lower/OpenMP/Utils.cpp | 28 ++--
flang/lib/Lower/OpenMP/Utils.h | 6 +-
flang/test/Lower/OpenMP/fuse01.f90 | 93 ++++++++++++
flang/test/Lower/OpenMP/fuse02.f90 | 123 +++++++++++++++
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 53 +++++++
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 111 ++++++++++++++
mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 34 +++++
mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 68 +++++++++
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 54 +++++++
mlir/test/Dialect/OpenMP/cli-fuse.mlir | 114 ++++++++++++++
mlir/test/Dialect/OpenMP/invalid-fuse.mlir | 100 +++++++++++++
.../test/Target/LLVMIR/openmp-cli-fuse01.mlir | 100 +++++++++++++
.../test/Target/LLVMIR/openmp-cli-fuse02.mlir | 140 ++++++++++++++++++
.../test/transform/fuse/do-looprange.f90 | 60 ++++++++
openmp/runtime/test/transform/fuse/do.f90 | 52 +++++++
19 files changed, 1194 insertions(+), 28 deletions(-)
create mode 100644 flang/test/Lower/OpenMP/fuse01.f90
create mode 100644 flang/test/Lower/OpenMP/fuse02.f90
create mode 100644 mlir/test/Dialect/OpenMP/cli-fuse.mlir
create mode 100644 mlir/test/Dialect/OpenMP/invalid-fuse.mlir
create mode 100644 mlir/test/Target/LLVMIR/openmp-cli-fuse01.mlir
create mode 100644 mlir/test/Target/LLVMIR/openmp-cli-fuse02.mlir
create mode 100644 openmp/runtime/test/transform/fuse/do-looprange.f90
create mode 100644 openmp/runtime/test/transform/fuse/do.f90
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 4a392381287d5..ab3a174c7ad69 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -279,6 +279,7 @@ bool ClauseProcessor::processCollapse(
llvm::SmallVectorImpl<const semantics::Symbol *> &iv) const {
int64_t numCollapse = collectLoopRelatedInfo(converter, currentLocation, eval,
+ eval.getFirstNestedEvaluation(),
clauses, loopResult, iv);
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
collapseResult.collapseNumLoops = firOpBuilder.getI64IntegerAttr(numCollapse);
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index b1a3c3d3c5439..f2defc62dce91 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -1063,7 +1063,10 @@ Link make(const parser::OmpClause::Link &inp,
LoopRange make(const parser::OmpClause::Looprange &inp,
semantics::SemanticsContext &semaCtx) {
- llvm_unreachable("Unimplemented: looprange");
+ auto &t0 = std::get<0>(inp.v.t);
+ auto &t1 = std::get<1>(inp.v.t);
+ return LoopRange{{/*First*/ makeExpr(t0, semaCtx),
+ /*Count*/ makeExpr(t1, semaCtx)}};
}
Map make(const parser::OmpClause::Map &inp,
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 83c2eda0a2dc7..da9480123513f 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -347,7 +347,8 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
mlir::omp::LoopRelatedClauseOps result;
llvm::SmallVector<const semantics::Symbol *> iv;
collectLoopRelatedInfo(converter, converter.getCurrentLocation(), eval,
- clauses, result, iv);
+ eval.getFirstNestedEvaluation(), clauses, result,
+ iv);
// Update the original variable just before exiting the worksharing
// loop. Conversion as follows:
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index b11a1a14db066..5a31443f4eeee 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1982,9 +1982,9 @@ genLoopOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
static void genCanonicalLoopNest(
lower::AbstractConverter &converter, lower::SymMap &symTable,
semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
- mlir::Location loc, const ConstructQueue &queue,
- ConstructQueue::const_iterator item, size_t numLoops,
- llvm::SmallVectorImpl<mlir::omp::CanonicalLoopOp> &loops) {
+ lower::pft::Evaluation &nestedEval, mlir::Location loc,
+ const ConstructQueue &queue, ConstructQueue::const_iterator item,
+ size_t numLoops, llvm::SmallVectorImpl<mlir::omp::CanonicalLoopOp> &loops) {
assert(loops.empty() && "Expecting empty list to fill");
assert(numLoops >= 1 && "Expecting at least one loop");
@@ -1992,7 +1992,8 @@ static void genCanonicalLoopNest(
mlir::omp::LoopRelatedClauseOps loopInfo;
llvm::SmallVector<const semantics::Symbol *, 3> ivs;
- collectLoopRelatedInfo(converter, loc, eval, numLoops, loopInfo, ivs);
+ collectLoopRelatedInfo(converter, loc, eval, nestedEval, numLoops, loopInfo,
+ ivs);
assert(ivs.size() == numLoops &&
"Expected to parse as many loop variables as there are loops");
@@ -2014,7 +2015,7 @@ static void genCanonicalLoopNest(
// Step 1: Loop prologues
// Computing the trip count must happen before entering the outermost loop
- lower::pft::Evaluation *innermostEval = &eval.getFirstNestedEvaluation();
+ lower::pft::Evaluation *innermostEval = &nestedEval;
for ([[maybe_unused]] auto iv : ivs) {
if (innermostEval->getIf<parser::DoConstruct>()->IsDoConcurrent()) {
// OpenMP specifies DO CONCURRENT only with the `!omp loop` construct.
@@ -2186,7 +2187,8 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<mlir::omp::CanonicalLoopOp, 3> canonLoops;
canonLoops.reserve(numLoops);
- genCanonicalLoopNest(converter, symTable, semaCtx, eval, loc, queue, item,
+ genCanonicalLoopNest(converter, symTable, semaCtx, eval,
+ eval.getFirstNestedEvaluation(), loc, queue, item,
numLoops, canonLoops);
assert((canonLoops.size() == numLoops) &&
"Expecting the predetermined number of loops");
@@ -2217,6 +2219,58 @@ static void genTileOp(Fortran::lower::AbstractConverter &converter,
sizesClause.sizes);
}
+static void genFuseOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::lower::SymMap &symTable,
+ lower::StatementContext &stmtCtx,
+ Fortran::semantics::SemanticsContext &semaCtx,
+ Fortran::lower::pft::Evaluation &eval, mlir::Location loc,
+ const ConstructQueue &queue,
+ ConstructQueue::const_iterator item) {
+ fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+ // Lowers a fuse construct: one canonical loop per nested loop, then a single omp.fuse over their CLIs.
+ int32_t first = 0; // stays 0 unless a looprange clause is found below
+ int32_t count = 0; // 0 doubles as the "no looprange clause" marker further down
+ auto iter = llvm::find_if(item->clauses, [](const Clause &clause) {
+ return clause.id == llvm::omp::Clause::OMPC_looprange;
+ });
+ if (iter != item->clauses.end()) {
+ const auto &looprange = std::get<clause::LoopRange>(iter->u);
+ first = evaluate::ToInt64(std::get<0>(looprange.t)).value(); // NOTE(review): unchecked .value(); presumably semantics guarantees a constant - confirm. Result is narrowed to int32_t.
+ count = evaluate::ToInt64(std::get<1>(looprange.t)).value(); // NOTE(review): same unchecked access and narrowing as `first`
+ }
+ // Applyees are the CLI values of the loops handed to omp.fuse, in source order.
+ llvm::SmallVector<mlir::Value> applyees;
+ for (auto &child : eval.getNestedEvaluations()) {
+ // Stop at the last nested evaluation: it is the OmpEndLoopDirective, not a loop
+ if (&child == &eval.getLastNestedEvaluation())
+ break;
+
+ // Emit this child as a single canonical loop (numLoops == 1)
+ llvm::SmallVector<mlir::omp::CanonicalLoopOp> canonLoops;
+ genCanonicalLoopNest(converter, symTable, semaCtx, eval, child, loc, queue,
+ item, 1, canonLoops);
+
+ auto cli = llvm::getSingleElement(canonLoops).getCli();
+ applyees.push_back(cli);
+ }
+ // Generatees: one CLI for the fused loop, plus one for every applyee that
+ // falls outside the looprange when the clause is present
+ llvm::SmallVector<mlir::Value> generatees;
+ int64_t numGeneratees = count == 0 ? 1 : applyees.size() - count + 1; // NOTE(review): mixes size_t with int32_t; assumes count <= applyees.size() - confirm it is validated earlier
+ for (int i = 0; i < numGeneratees; i++) {
+ auto fusedCLI = mlir::omp::NewCliOp::create(firOpBuilder, loc);
+ generatees.push_back(fusedCLI);
+ }
+ auto op = mlir::omp::FuseOp::create(firOpBuilder, loc, generatees, applyees);
+
+ if (count != 0) { // attach the range attributes only when a looprange clause was given
+ mlir::IntegerAttr firstAttr = firOpBuilder.getI32IntegerAttr(first);
+ mlir::IntegerAttr countAttr = firOpBuilder.getI32IntegerAttr(count);
+ op->setAttr("first", firstAttr);
+ op->setAttr("count", countAttr);
+ }
+}
+
static void genUnrollOp(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
lower::StatementContext &stmtCtx,
@@ -2233,7 +2287,8 @@ static void genUnrollOp(Fortran::lower::AbstractConverter &converter,
// Emit the associated loop
llvm::SmallVector<mlir::omp::CanonicalLoopOp, 1> canonLoops;
- genCanonicalLoopNest(converter, symTable, semaCtx, eval, loc, queue, item, 1,
+ genCanonicalLoopNest(converter, symTable, semaCtx, eval,
+ eval.getFirstNestedEvaluation(), loc, queue, item, 1,
canonLoops);
llvm::SmallVector<mlir::Value, 1> applyees;
@@ -3507,13 +3562,9 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
case llvm::omp::Directive::OMPD_tile:
genTileOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item);
break;
- case llvm::omp::Directive::OMPD_fuse: {
- unsigned version = semaCtx.langOptions().OpenMPVersion;
- if (!semaCtx.langOptions().OpenMPSimd)
- TODO(loc, "Unhandled loop directive (" +
- llvm::omp::getOpenMPDirectiveName(dir, version) + ")");
+ case llvm::omp::Directive::OMPD_fuse:
+ genFuseOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item);
break;
- }
case llvm::omp::Directive::OMPD_unroll:
genUnrollOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item);
break;
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index 7d7a4869ab3a6..913e4d1e69500 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -812,13 +812,14 @@ void collectTileSizesFromOpenMPConstruct(
int64_t collectLoopRelatedInfo(
lower::AbstractConverter &converter, mlir::Location currentLocation,
- lower::pft::Evaluation &eval, const omp::List<omp::Clause> &clauses,
+ lower::pft::Evaluation &eval, lower::pft::Evaluation &nestedEval,
+ const omp::List<omp::Clause> &clauses,
mlir::omp::LoopRelatedClauseOps &result,
llvm::SmallVectorImpl<const semantics::Symbol *> &iv) {
int64_t numCollapse = 1;
// Collect the loops to collapse.
- lower::pft::Evaluation *doConstructEval = &eval.getFirstNestedEvaluation();
+ lower::pft::Evaluation *doConstructEval = &nestedEval;
if (doConstructEval->getIf<parser::DoConstruct>()->IsDoConcurrent()) {
TODO(currentLocation, "Do Concurrent in Worksharing loop construct");
}
@@ -830,21 +831,21 @@ int64_t collectLoopRelatedInfo(
numCollapse = collapseValue;
}
- collectLoopRelatedInfo(converter, currentLocation, eval, numCollapse, result,
- iv);
+ collectLoopRelatedInfo(converter, currentLocation, eval, nestedEval,
+ numCollapse, result, iv);
return numCollapse;
}
void collectLoopRelatedInfo(
lower::AbstractConverter &converter, mlir::Location currentLocation,
- lower::pft::Evaluation &eval, int64_t numCollapse,
- mlir::omp::LoopRelatedClauseOps &result,
+ lower::pft::Evaluation &eval, lower::pft::Evaluation &nestedEval,
+ int64_t numCollapse, mlir::omp::LoopRelatedClauseOps &result,
llvm::SmallVectorImpl<const semantics::Symbol *> &iv) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
// Collect the loops to collapse.
- lower::pft::Evaluation *doConstructEval = &eval.getFirstNestedEvaluation();
+ lower::pft::Evaluation *doConstructEval = &nestedEval;
if (doConstructEval->getIf<parser::DoConstruct>()->IsDoConcurrent()) {
TODO(currentLocation, "Do Concurrent in Worksharing loop construct");
}
@@ -852,10 +853,15 @@ void collectLoopRelatedInfo(
// Collect sizes from tile directive if present.
std::int64_t sizesLengthValue = 0l;
if (auto *ompCons{eval.getIf<parser::OpenMPConstruct>()}) {
- processTileSizesFromOpenMPConstruct(
- ompCons, [&](const parser::OmpClause::Sizes *tclause) {
- sizesLengthValue = tclause->v.size();
- });
+ if (auto *ompLoop{std::get_if<parser::OpenMPLoopConstruct>(&ompCons->u)}) {
+ const parser::OmpDirectiveSpecification &beginSpec{ompLoop->BeginDir()};
+ if (beginSpec.DirId() == llvm::omp::Directive::OMPD_tile) {
+ processTileSizesFromOpenMPConstruct(
+ ompCons, [&](const parser::OmpClause::Sizes *tclause) {
+ sizesLengthValue = tclause->v.size();
+ });
+ }
+ }
}
std::int64_t collapseValue = std::max(numCollapse, sizesLengthValue);
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index 2960b663b08b2..886a5c1835f7e 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -169,13 +169,15 @@ void lastprivateModifierNotSupported(const omp::clause::Lastprivate &lastp,
int64_t collectLoopRelatedInfo(
lower::AbstractConverter &converter, mlir::Location currentLocation,
- lower::pft::Evaluation &eval, const omp::List<omp::Clause> &clauses,
+ lower::pft::Evaluation &eval, lower::pft::Evaluation &nestedEval,
+ const omp::List<omp::Clause> &clauses,
mlir::omp::LoopRelatedClauseOps &result,
llvm::SmallVectorImpl<const semantics::Symbol *> &iv);
void collectLoopRelatedInfo(
lower::AbstractConverter &converter, mlir::Location currentLocation,
- lower::pft::Evaluation &eval, std::int64_t collapseValue,
+ lower::pft::Evaluation &eval, lower::pft::Evaluation &nestedEval,
+ std::int64_t collapseValue,
// const omp::List<omp::Clause> &clauses,
mlir::omp::LoopRelatedClauseOps &result,
llvm::SmallVectorImpl<const semantics::Symbol *> &iv);
diff --git a/flang/test/Lower/OpenMP/fuse01.f90 b/flang/test/Lower/OpenMP/fuse01.f90
new file mode 100644
index 0000000000000..1377bf3e9c529
--- /dev/null
+++ b/flang/test/Lower/OpenMP/fuse01.f90
@@ -0,0 +1,93 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=60 -o - %s | FileCheck %s
+
+
+subroutine omp_fuse01(lb1, ub1, inc1, lb2, ub2, inc2)
+ integer res, i, j
+ integer lb1, ub1, inc1
+ integer lb2, ub2, inc2
+
+ !$omp fuse
+ do i = lb1, ub1, inc1
+ res = i
+ end do
+ do j = lb2, ub2, inc2
+ res = j
+ end do
+ !$omp end fuse
+
+end subroutine omp_fuse01
+
+
+! CHECK-LABEL: func.func @_QPomp_fuse01(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "lb1"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "ub1"},
+! CHECK-SAME: %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "inc1"},
+! CHECK-SAME: %[[ARG3:.*]]: !fir.ref<i32> {fir.bindc_name = "lb2"},
+! CHECK-SAME: %[[ARG4:.*]]: !fir.ref<i32> {fir.bindc_name = "ub2"},
+! CHECK-SAME: %[[ARG5:.*]]: !fir.ref<i32> {fir.bindc_name = "inc2"}) {
+! CHECK: %[[DUMMY_SCOPE_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_fuse01Ei"}
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "_QFomp_fuse01Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Einc1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_2:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Einc2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFomp_fuse01Ej"}
+! CHECK: %[[DECLARE_3:.*]]:2 = hlfir.declare %[[ALLOCA_1]] {uniq_name = "_QFomp_fuse01Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Elb1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_5:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Elb2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_2:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_fuse01Eres"}
+! CHECK: %[[DECLARE_6:.*]]:2 = hlfir.declare %[[ALLOCA_2]] {uniq_name = "_QFomp_fuse01Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_7:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Eub1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_8:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse01Eub2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DECLARE_4]]#0 : !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DECLARE_7]]#0 : !fir.ref<i32>
+! CHECK: %[[LOAD_2:.*]] = fir.load %[[DECLARE_1]]#0 : !fir.ref<i32>
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : i32
+! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_2]], %[[CONSTANT_0]] : i32
+! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_0]], %[[LOAD_2]] : i32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[SUBI_0]], %[[LOAD_2]] : i32
+! CHECK: %[[SELECT_1:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: %[[SELECT_2:.*]] = arith.select %[[CMPI_0]], %[[LOAD_0]], %[[LOAD_1]] : i32
+! CHECK: %[[SUBI_1:.*]] = arith.subi %[[SELECT_2]], %[[SELECT_1]] overflow<nuw> : i32
+! CHECK: %[[DIVUI_0:.*]] = arith.divui %[[SUBI_1]], %[[SELECT_0]] : i32
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[DIVUI_0]], %[[CONSTANT_1]] overflow<nuw> : i32
+! CHECK: %[[CMPI_1:.*]] = arith.cmpi slt, %[[SELECT_2]], %[[SELECT_1]] : i32
+! CHECK: %[[SELECT_3:.*]] = arith.select %[[CMPI_1]], %[[CONSTANT_0]], %[[ADDI_0]] : i32
+! CHECK: %[[NEW_CLI_0:.*]] = omp.new_cli
+! CHECK: omp.canonical_loop(%[[NEW_CLI_0]]) %[[VAL_0:.*]] : i32 in range(%[[SELECT_3]]) {
+! CHECK: %[[MULI_0:.*]] = arith.muli %[[VAL_0]], %[[LOAD_2]] : i32
+! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_0]], %[[MULI_0]] : i32
+! CHECK: hlfir.assign %[[ADDI_1]] to %[[DECLARE_0]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[LOAD_3:.*]] = fir.load %[[DECLARE_0]]#0 : !fir.ref<i32>
+! CHECK: hlfir.assign %[[LOAD_3]] to %[[DECLARE_6]]#0 : i32, !fir.ref<i32>
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: %[[LOAD_4:.*]] = fir.load %[[DECLARE_5]]#0 : !fir.ref<i32>
+! CHECK: %[[LOAD_5:.*]] = fir.load %[[DECLARE_8]]#0 : !fir.ref<i32>
+! CHECK: %[[LOAD_6:.*]] = fir.load %[[DECLARE_2]]#0 : !fir.ref<i32>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : i32
+! CHECK: %[[CMPI_2:.*]] = arith.cmpi slt, %[[LOAD_6]], %[[CONSTANT_2]] : i32
+! CHECK: %[[SUBI_2:.*]] = arith.subi %[[CONSTANT_2]], %[[LOAD_6]] : i32
+! CHECK: %[[SELECT_4:.*]] = arith.select %[[CMPI_2]], %[[SUBI_2]], %[[LOAD_6]] : i32
+! CHECK: %[[SELECT_5:.*]] = arith.select %[[CMPI_2]], %[[LOAD_5]], %[[LOAD_4]] : i32
+! CHECK: %[[SELECT_6:.*]] = arith.select %[[CMPI_2]], %[[LOAD_4]], %[[LOAD_5]] : i32
+! CHECK: %[[SUBI_3:.*]] = arith.subi %[[SELECT_6]], %[[SELECT_5]] overflow<nuw> : i32
+! CHECK: %[[DIVUI_1:.*]] = arith.divui %[[SUBI_3]], %[[SELECT_4]] : i32
+! CHECK: %[[ADDI_2:.*]] = arith.addi %[[DIVUI_1]], %[[CONSTANT_3]] overflow<nuw> : i32
+! CHECK: %[[CMPI_3:.*]] = arith.cmpi slt, %[[SELECT_6]], %[[SELECT_5]] : i32
+! CHECK: %[[SELECT_7:.*]] = arith.select %[[CMPI_3]], %[[CONSTANT_2]], %[[ADDI_2]] : i32
+! CHECK: %[[NEW_CLI_1:.*]] = omp.new_cli
+! CHECK: omp.canonical_loop(%[[NEW_CLI_1]]) %[[VAL_1:.*]] : i32 in range(%[[SELECT_7]]) {
+! CHECK: %[[MULI_1:.*]] = arith.muli %[[VAL_1]], %[[LOAD_6]] : i32
+! CHECK: %[[ADDI_3:.*]] = arith.addi %[[LOAD_4]], %[[MULI_1]] : i32
+! CHECK: hlfir.assign %[[ADDI_3]] to %[[DECLARE_3]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[LOAD_7:.*]] = fir.load %[[DECLARE_3]]#0 : !fir.ref<i32>
+! CHECK: hlfir.assign %[[LOAD_7]] to %[[DECLARE_6]]#0 : i32, !fir.ref<i32>
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: %[[NEW_CLI_2:.*]] = omp.new_cli
+! CHECK: omp.fuse (%[[NEW_CLI_2]]) <- (%[[NEW_CLI_0]], %[[NEW_CLI_1]])
+! CHECK: return
+! CHECK: }
+
diff --git a/flang/test/Lower/OpenMP/fuse02.f90 b/flang/test/Lower/OpenMP/fuse02.f90
new file mode 100644
index 0000000000000..5a0f37827c36a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/fuse02.f90
@@ -0,0 +1,123 @@
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=60 -o - %s | FileCheck %s
+
+
+subroutine omp_fuse02(lb1, ub1, inc1, lb2, ub2, inc2)
+ integer res, i, j, k
+ integer lb1, ub1, inc1
+ integer lb2, ub2, inc2
+
+ !$omp fuse looprange(2,2)
+ do i = lb1, ub1, inc1
+ res = i
+ end do
+ do j = lb2, ub2, inc2
+ res = j
+ end do
+ do k = lb1, ub2, inc1
+ res = k
+ end do
+ !$omp end fuse
+
+end subroutine omp_fuse02
+
+
+! CHECK-LABEL: func.func @_QPomp_fuse02(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "lb1"},
+! CHECK-SAME: %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "ub1"},
+! CHECK-SAME: %[[ARG2:.*]]: !fir.ref<i32> {fir.bindc_name = "inc1"},
+! CHECK-SAME: %[[ARG3:.*]]: !fir.ref<i32> {fir.bindc_name = "lb2"},
+! CHECK-SAME: %[[ARG4:.*]]: !fir.ref<i32> {fir.bindc_name = "ub2"},
+! CHECK-SAME: %[[ARG5:.*]]: !fir.ref<i32> {fir.bindc_name = "inc2"}) {
+! CHECK: %[[DUMMY_SCOPE_0:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[ALLOCA_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_fuse02Ei"}
+! CHECK: %[[DECLARE_0:.*]]:2 = hlfir.declare %[[ALLOCA_0]] {uniq_name = "_QFomp_fuse02Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_1:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Einc1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_2:.*]]:2 = hlfir.declare %[[ARG5]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Einc2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFomp_fuse02Ej"}
+! CHECK: %[[DECLARE_3:.*]]:2 = hlfir.declare %[[ALLOCA_1]] {uniq_name = "_QFomp_fuse02Ej"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_2:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFomp_fuse02Ek"}
+! CHECK: %[[DECLARE_4:.*]]:2 = hlfir.declare %[[ALLOCA_2]] {uniq_name = "_QFomp_fuse02Ek"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Elb1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_6:.*]]:2 = hlfir.declare %[[ARG3]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Elb2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOCA_3:.*]] = fir.alloca i32 {bindc_name = "res", uniq_name = "_QFomp_fuse02Eres"}
+! CHECK: %[[DECLARE_7:.*]]:2 = hlfir.declare %[[ALLOCA_3]] {uniq_name = "_QFomp_fuse02Eres"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_8:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Eub1"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[DECLARE_9:.*]]:2 = hlfir.declare %[[ARG4]] dummy_scope %[[DUMMY_SCOPE_0]] arg {{[0-9]+}} {uniq_name = "_QFomp_fuse02Eub2"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[LOAD_0:.*]] = fir.load %[[DECLARE_5]]#0 : !fir.ref<i32>
+! CHECK: %[[LOAD_1:.*]] = fir.load %[[DECLARE_8]]#0 : !fir.ref<i32>
+! CHECK: %[[LOAD_2:.*]] = fir.load %[[DECLARE_1]]#0 : !fir.ref<i32>
+! CHECK: %[[CONSTANT_0:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : i32
+! CHECK: %[[CMPI_0:.*]] = arith.cmpi slt, %[[LOAD_2]], %[[CONSTANT_0]] : i32
+! CHECK: %[[SUBI_0:.*]] = arith.subi %[[CONSTANT_0]], %[[LOAD_2]] : i32
+! CHECK: %[[SELECT_0:.*]] = arith.select %[[CMPI_0]], %[[SUBI_0]], %[[LOAD_2]] : i32
+! CHECK: %[[SELECT_1:.*]] = arith.select %[[CMPI_0]], %[[LOAD_1]], %[[LOAD_0]] : i32
+! CHECK: %[[SELECT_2:.*]] = arith.select %[[CMPI_0]], %[[LOAD_0]], %[[LOAD_1]] : i32
+! CHECK: %[[SUBI_1:.*]] = arith.subi %[[SELECT_2]], %[[SELECT_1]] overflow<nuw> : i32
+! CHECK: %[[DIVUI_0:.*]] = arith.divui %[[SUBI_1]], %[[SELECT_0]] : i32
+! CHECK: %[[ADDI_0:.*]] = arith.addi %[[DIVUI_0]], %[[CONSTANT_1]] overflow<nuw> : i32
+! CHECK: %[[CMPI_1:.*]] = arith.cmpi slt, %[[SELECT_2]], %[[SELECT_1]] : i32
+! CHECK: %[[SELECT_3:.*]] = arith.select %[[CMPI_1]], %[[CONSTANT_0]], %[[ADDI_0]] : i32
+! CHECK: %[[NEW_CLI_0:.*]] = omp.new_cli
+! CHECK: omp.canonical_loop(%[[NEW_CLI_0]]) %[[VAL_0:.*]] : i32 in range(%[[SELECT_3]]) {
+! CHECK: %[[MULI_0:.*]] = arith.muli %[[VAL_0]], %[[LOAD_2]] : i32
+! CHECK: %[[ADDI_1:.*]] = arith.addi %[[LOAD_0]], %[[MULI_0]] : i32
+! CHECK: hlfir.assign %[[ADDI_1]] to %[[DECLARE_0]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[LOAD_3:.*]] = fir.load %[[DECLARE_0]]#0 : !fir.ref<i32>
+! CHECK: hlfir.assign %[[LOAD_3]] to %[[DECLARE_7]]#0 : i32, !fir.ref<i32>
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: %[[LOAD_4:.*]] = fir.load %[[DECLARE_6]]#0 : !fir.ref<i32>
+! CHECK: %[[LOAD_5:.*]] = fir.load %[[DECLARE_9]]#0 : !fir.ref<i32>
+! CHECK: %[[LOAD_6:.*]] = fir.load %[[DECLARE_2]]#0 : !fir.ref<i32>
+! CHECK: %[[CONSTANT_2:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : i32
+! CHECK: %[[CMPI_2:.*]] = arith.cmpi slt, %[[LOAD_6]], %[[CONSTANT_2]] : i32
+! CHECK: %[[SUBI_2:.*]] = arith.subi %[[CONSTANT_2]], %[[LOAD_6]] : i32
+! CHECK: %[[SELECT_4:.*]] = arith.select %[[CMPI_2]], %[[SUBI_2]], %[[LOAD_6]] : i32
+! CHECK: %[[SELECT_5:.*]] = arith.select %[[CMPI_2]], %[[LOAD_5]], %[[LOAD_4]] : i32
+! CHECK: %[[SELECT_6:.*]] = arith.select %[[CMPI_2]], %[[LOAD_4]], %[[LOAD_5]] : i32
+! CHECK: %[[SUBI_3:.*]] = arith.subi %[[SELECT_6]], %[[SELECT_5]] overflow<nuw> : i32
+! CHECK: %[[DIVUI_1:.*]] = arith.divui %[[SUBI_3]], %[[SELECT_4]] : i32
+! CHECK: %[[ADDI_2:.*]] = arith.addi %[[DIVUI_1]], %[[CONSTANT_3]] overflow<nuw> : i32
+! CHECK: %[[CMPI_3:.*]] = arith.cmpi slt, %[[SELECT_6]], %[[SELECT_5]] : i32
+! CHECK: %[[SELECT_7:.*]] = arith.select %[[CMPI_3]], %[[CONSTANT_2]], %[[ADDI_2]] : i32
+! CHECK: %[[NEW_CLI_1:.*]] = omp.new_cli
+! CHECK: omp.canonical_loop(%[[NEW_CLI_1]]) %[[VAL_1:.*]] : i32 in range(%[[SELECT_7]]) {
+! CHECK: %[[MULI_1:.*]] = arith.muli %[[VAL_1]], %[[LOAD_6]] : i32
+! CHECK: %[[ADDI_3:.*]] = arith.addi %[[LOAD_4]], %[[MULI_1]] : i32
+! CHECK: hlfir.assign %[[ADDI_3]] to %[[DECLARE_3]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[LOAD_7:.*]] = fir.load %[[DECLARE_3]]#0 : !fir.ref<i32>
+! CHECK: hlfir.assign %[[LOAD_7]] to %[[DECLARE_7]]#0 : i32, !fir.ref<i32>
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: %[[LOAD_8:.*]] = fir.load %[[DECLARE_5]]#0 : !fir.ref<i32>
+! CHECK: %[[LOAD_9:.*]] = fir.load %[[DECLARE_9]]#0 : !fir.ref<i32>
+! CHECK: %[[LOAD_10:.*]] = fir.load %[[DECLARE_1]]#0 : !fir.ref<i32>
+! CHECK: %[[CONSTANT_4:.*]] = arith.constant 0 : i32
+! CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : i32
+! CHECK: %[[CMPI_4:.*]] = arith.cmpi slt, %[[LOAD_10]], %[[CONSTANT_4]] : i32
+! CHECK: %[[SUBI_4:.*]] = arith.subi %[[CONSTANT_4]], %[[LOAD_10]] : i32
+! CHECK: %[[SELECT_8:.*]] = arith.select %[[CMPI_4]], %[[SUBI_4]], %[[LOAD_10]] : i32
+! CHECK: %[[SELECT_9:.*]] = arith.select %[[CMPI_4]], %[[LOAD_9]], %[[LOAD_8]] : i32
+! CHECK: %[[SELECT_10:.*]] = arith.select %[[CMPI_4]], %[[LOAD_8]], %[[LOAD_9]] : i32
+! CHECK: %[[SUBI_5:.*]] = arith.subi %[[SELECT_10]], %[[SELECT_9]] overflow<nuw> : i32
+! CHECK: %[[DIVUI_2:.*]] = arith.divui %[[SUBI_5]], %[[SELECT_8]] : i32
+! CHECK: %[[ADDI_4:.*]] = arith.addi %[[DIVUI_2]], %[[CONSTANT_5]] overflow<nuw> : i32
+! CHECK: %[[CMPI_5:.*]] = arith.cmpi slt, %[[SELECT_10]], %[[SELECT_9]] : i32
+! CHECK: %[[SELECT_11:.*]] = arith.select %[[CMPI_5]], %[[CONSTANT_4]], %[[ADDI_4]] : i32
+! CHECK: %[[NEW_CLI_2:.*]] = omp.new_cli
+! CHECK: omp.canonical_loop(%[[NEW_CLI_2]]) %[[VAL_2:.*]] : i32 in range(%[[SELECT_11]]) {
+! CHECK: %[[MULI_2:.*]] = arith.muli %[[VAL_2]], %[[LOAD_10]] : i32
+! CHECK: %[[ADDI_5:.*]] = arith.addi %[[LOAD_8]], %[[MULI_2]] : i32
+! CHECK: hlfir.assign %[[ADDI_5]] to %[[DECLARE_4]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[LOAD_11:.*]] = fir.load %[[DECLARE_4]]#0 : !fir.ref<i32>
+! CHECK: hlfir.assign %[[LOAD_11]] to %[[DECLARE_7]]#0 : i32, !fir.ref<i32>
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: %[[NEW_CLI_3:.*]] = omp.new_cli
+! CHECK: %[[NEW_CLI_4:.*]] = omp.new_cli
+! CHECK: omp.fuse (%[[NEW_CLI_3]], %[[NEW_CLI_4]]) <- (%[[NEW_CLI_0]], %[[NEW_CLI_1]], %[[NEW_CLI_2]]) {count = 2 : i32, first = 2 : i32}
+! CHECK: return
+! CHECK: }
+
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index f864a895a1259..9073aa7afccdd 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1278,6 +1278,59 @@ class OpenMPIRBuilder {
tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
ArrayRef<Value *> TileSizes);
+ /// Fuse a sequence of loops.
+ ///
+ /// Fuses the loops of \p Loops into a single loop.
+ /// The loops are merged as shown in the following example:
+ ///
+ /// Example:
+ /// \code
+ /// for (int i = lb0; i < ub0; i += st0) // trip count (positive step):
+ /// body(i) // tc0 = (ub0 - lb0 + st0 - 1) / st0
+ /// for (int j = lb1; j < ub1; j += st1)
+ /// body(j);
+ ///
+ /// ...
+ ///
+ /// for (int k = lbk; k < ubk; k += stk)
+ /// body(k);
+ /// \endcode
+ ///
+ /// After fusing the loops a single loop is left:
+ /// \code
+ /// for (fuse.index = 0; fuse.index < max(tc0, tc1, ... tck); ++fuse.index) {
+ /// if (fuse.index < tc0){
+ /// iv0 = lb0 + st0 * fuse.index;
+ /// original.index0 = iv0
+ /// body(0);
+ /// }
+ /// if (fuse.index < tc1){
+ /// iv1 = lb1 + st1 * fuse.index;
+ /// original.index1 = iv1
+ /// body(1);
+ /// }
+ ///
+ /// ...
+ ///
+ /// if (fuse.index < tck){
+ /// ivk = lbk + stk * fuse.index;
+ /// original.indexk = ivk
+ /// body(k);
+ /// }
+ /// }
+ /// \endcode
+ ///
+ ///
+ /// \param DL Debug location for instructions added by fusion.
+ ///
+ /// \param Loops Loops to fuse. The CanonicalLoopInfo objects are
+ /// invalidated by this method, i.e. should not be used after
+ /// fusion.
+ ///
+ /// \returns A single loop generated by the loop fusion
+ LLVM_ABI CanonicalLoopInfo *fuseLoops(DebugLoc DL,
+ ArrayRef<CanonicalLoopInfo *> Loops);
+
/// Fully unroll a loop.
///
/// Instead of unrolling the loop immediately (and duplicating its body
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 5101717526263..d99575bd5f8f2 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5815,6 +5815,117 @@ static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
}
}
+CanonicalLoopInfo *
+OpenMPIRBuilder::fuseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops) {
+
+ CanonicalLoopInfo *firstLoop = Loops.front();
+ CanonicalLoopInfo *lastLoop = Loops.back();
+ Function *F = firstLoop->getPreheader()->getParent();
+
+ // Control blocks of the input loops; removed as unused once fusion is done.
+ SmallVector<BasicBlock *> oldControlBBs;
+ for (CanonicalLoopInfo *Loop : Loops)
+ Loop->collectControlBlocks(oldControlBBs);
+
+ // Record each loop's trip count; they guard the per-loop bodies below.
+ SmallVector<Value *> origTripCounts;
+ for (CanonicalLoopInfo *L : Loops) {
+ assert(L->isValid() && "All input loops must be valid canonical loops");
+ origTripCounts.push_back(L->getTripCount());
+ }
+
+ Builder.SetCurrentDebugLocation(DL);
+
+ // Compute the fused trip count max(origTripCounts) in a dedicated block.
+ // NOTE(review): this max is a signed compare (ICmpSGT) -- confirm intended.
+ BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
+ F, firstLoop->getHeader());
+ Builder.SetInsertPoint(TCBlock);
+ Value *fusedTripCount = nullptr;
+ for (CanonicalLoopInfo *L : Loops) {
+ assert(L->isValid() && "All loops to fuse must be valid canonical loops");
+ Value *origTripCount = L->getTripCount();
+ if (!fusedTripCount) {
+ fusedTripCount = origTripCount;
+ continue;
+ }
+ Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
+ fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
+ Twine(".omp.fuse.tc"));
+ }
+
+ // Generate the new loop skeleton with fusedTripCount as its trip count.
+ CanonicalLoopInfo *fused =
+ createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
+ lastLoop->getLatch(), "fused");
+
+ // Replace the original loops with the fused loop.
+ // The Preheader and After blocks are not considered part of the CLI.
+ // They are used to compute the individual trip counts of the loops,
+ // so they have to be placed before the resulting fused loop.
+ // They are moved in front of TCBlock; the last loop's After block stays.
+ for (size_t i = 0; i < Loops.size() - 1; ++i) {
+ Loops[i]->getPreheader()->moveBefore(TCBlock);
+ Loops[i]->getAfter()->moveBefore(TCBlock);
+ }
+ lastLoop->getPreheader()->moveBefore(TCBlock);
+
+ for (size_t i = 0; i < Loops.size() - 1; ++i) {
+ redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
+ redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
+ }
+ redirectTo(lastLoop->getPreheader(), TCBlock, DL);
+ redirectTo(TCBlock, fused->getPreheader(), DL);
+ redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);
+
+ // Build the fused body.
+ // Create one condition block per loop: fused IV < original trip count.
+ SmallVector<BasicBlock *> condBBs;
+ SmallVector<Value *> condValues;
+ for (size_t i = 0; i < Loops.size(); ++i) {
+ BasicBlock *condBlock = BasicBlock::Create(
+ F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
+ Builder.SetInsertPoint(condBlock);
+ Value *condValue =
+ Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
+ condBBs.push_back(condBlock);
+ condValues.push_back(condValue);
+ }
+ // Chain: condition block i enters body i or skips to condition block i + 1.
+ redirectTo(fused->getBody(), condBBs[0], DL);
+ for (size_t i = 0; i < Loops.size() - 1; ++i) {
+ Builder.SetInsertPoint(condBBs[i]);
+ Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
+ redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
+ // Replace the original IV with the fused IV.
+ Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
+ }
+ // The last condition block skips to the fused latch instead.
+ Builder.SetInsertPoint(condBBs.back());
+ Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
+ fused->getLatch());
+ redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
+ // Replace the original IV with the fused IV.
+ lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());
+
+ // The loop latch must have only one predecessor. Currently it is branched to
+ // from both the last condition block and the last loop body, so split it.
+ fused->getLatch()->splitBasicBlock(fused->getLatch()->begin(),
+ "omp.fused.pre_latch", /*Before=*/true);
+
+ // Remove the now-unused control blocks of the original loops.
+ removeUnusedBlocksFromParent(oldControlBBs);
+
+ // Invalidate the old CLIs; they are replaced by the fused CLI.
+ for (CanonicalLoopInfo *L : Loops)
+ L->invalidate();
+
+#ifndef NDEBUG
+ fused->assertOK();
+#endif
+ return fused;
+}
+
void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
LLVMContext &Ctx = Builder.getContext();
addLoopMetadata(
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 377f1febf6b8f..2752c2a806847 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -550,6 +550,40 @@ def TileOp : OpenMPTransformBase_Op<"tile",
let hasVerifier = 1;
}
+//===----------------------------------------------------------------------===//
+// OpenMP fuse operation
+//===----------------------------------------------------------------------===//
+
+def FuseOp : OpenMPTransformBase_Op<"fuse"> {
+ let summary = "OpenMP fuse operation";
+ let description = [{
+ Represents the OpenMP fuse directive introduced in OpenMP 6.0.
+
+ The construct takes a loop sequence and merges the loops specified by the
+ first and count attributes. It generates a loop sequence consisting of the
+ loops before position first (untouched), the generated fused loop, and the
+ loops from position first + count onwards (untouched), maintaining the
+ original order. If no attributes are specified, all the loops in the
+ sequence are fused, generating a single loop.
+ Each logical iteration of the fused loop executes a logical iteration of
+ each affected loop. The fused loop has the number of logical iterations
+ equal to the affected loop with most logical iterations.
+
+ The first and count attributes are constant and known beforehand.
+ }]#clausesDescription;
+
+ let extraClassDeclaration = [{
+ IntegerAttr getFirst() {
+ return this->getOperation()->getAttrOfType<mlir::IntegerAttr>("first");
+ }
+ IntegerAttr getCount() {
+ return this->getOperation()->getAttrOfType<mlir::IntegerAttr>("count");
+ }
+ }]#clausesExtraClassDeclaration;
+
+ let hasVerifier = 1;
+}
+
//===----------------------------------------------------------------------===//
// 2.8.3 Workshare Construct
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 1b069c62a8be9..8373a18df281a 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -3429,6 +3429,20 @@ void NewCliOp::getAsmResultNames(OpAsmSetValueNameFn setNameFn) {
.Case([&](UnrollHeuristicOp op) -> std::string {
llvm_unreachable("heuristic unrolling does not generate a loop");
})
+ .Case([&](FuseOp op) -> std::string {
+ unsigned int first = 0;
+ unsigned int count = 0;
+ if (op.getFirst() && op.getCount()) {
+ first = op.getFirst().getInt();
+ count = op.getCount().getInt();
+ }
+ unsigned opnum = generator->getOperandNumber();
+ if ((first != 0 && opnum <= first - 1) ||
+ (count != 0 && opnum >= first + 1))
+ return "canonloop_fuse";
+ else
+ return "fused";
+ })
.Case([&](TileOp op) -> std::string {
auto [generateesFirst, generateesCount] =
op.getGenerateesODSOperandIndexAndLength();
@@ -3804,6 +3818,60 @@ std::pair<unsigned, unsigned> TileOp::getGenerateesODSOperandIndexAndLength() {
return getODSOperandIndexAndLength(odsIndex_generatees);
}
+//===----------------------------------------------------------------------===//
+// FuseOp
+//===----------------------------------------------------------------------===//
+
+static void printLoopTransformClis(OpAsmPrinter &p, FuseOp op,
+ OperandRange generatees,
+ OperandRange applyees) {
+ if (!generatees.empty())
+ p << '(' << llvm::interleaved(generatees) << ')';
+
+ if (!applyees.empty())
+ p << " <- (" << llvm::interleaved(applyees) << ')';
+}
+
+LogicalResult FuseOp::verify() {
+ if (getApplyees().size() < 2)
+ return emitOpError() << "must apply to at least two loops";
+
+ if (getFirst() && getCount()) {
+ unsigned int first = getFirst().getInt();
+ unsigned int count = getCount().getInt();
+ if (first + count - 1 > getApplyees().size())
+ return emitOpError() << "the numbers of applyees must be at least first "
+ "minus one plus count attributes";
+ if (!getGeneratees().empty() &&
+ getGeneratees().size() != getApplyees().size() + 1 - count)
+ return emitOpError() << "the number of generatees must be the number of "
+ "aplyees plus one minus count";
+
+ } else {
+ if (!getGeneratees().empty() && getGeneratees().size() != 1)
+ return emitOpError()
+ << "in a complete fuse the number of generatees must be exactly 1";
+ }
+ for (auto &&applyee : getApplyees()) {
+ auto [create, gen, cons] = decodeCli(applyee);
+
+ if (!gen)
+ return emitOpError() << "applyee CLI has no generator";
+ auto loop = dyn_cast_or_null<CanonicalLoopOp>(gen->getOwner());
+ if (!loop)
+ return emitOpError()
+ << "currently only supports omp.canonical_loop as applyee";
+ }
+ return success();
+}
+std::pair<unsigned, unsigned> FuseOp ::getApplyeesODSOperandIndexAndLength() {
+ return getODSOperandIndexAndLength(odsIndex_applyees);
+}
+
+std::pair<unsigned, unsigned> FuseOp::getGenerateesODSOperandIndexAndLength() {
+ return getODSOperandIndexAndLength(odsIndex_generatees);
+}
+
//===----------------------------------------------------------------------===//
// Critical construct (2.17.1)
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 8edec990eaaba..e6880ce33b061 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -3207,6 +3207,57 @@ static LogicalResult applyTile(omp::TileOp op, llvm::IRBuilderBase &builder,
return success();
}
+/// Apply a `#pragma omp fuse` / `!$omp fuse` transformation using the
+/// OpenMPIRBuilder.
+///
+/// The applyees are partitioned by the 1-based `first` and `count` attributes
+/// into the loops preceding the fused range, the loops to fuse, and the loops
+/// following the range; when the attributes are absent all applyees are
+/// fused. If generatees are given, they are mapped, in order, to the leading
+/// untouched loops, the fused loop, and the trailing untouched loops.
+static LogicalResult applyFuse(omp::FuseOp op, llvm::IRBuilderBase &builder,
+                               LLVM::ModuleTranslation &moduleTranslation) {
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  llvm::OpenMPIRBuilder::LocationDescription loc(builder);
+
+  // Both stay 0 unless the op carries both attributes; a value of 0 disables
+  // the corresponding range test below, so every applyee ends up in toFuse.
+  unsigned int first = 0;
+  unsigned int count = 0;
+  if (op.getFirst() && op.getCount()) {
+    first = op.getFirst().getInt();
+    count = op.getCount().getInt();
+  }
+
+  // Select what CLIs are going to be fused
+  SmallVector<llvm::CanonicalLoopInfo *> beforeFuse, toFuse, afterFuse;
+  for (size_t i = 0; i < op.getApplyees().size(); i++) {
+    Value applyee = op.getApplyees()[i];
+    llvm::CanonicalLoopInfo *consBuilderCLI =
+        moduleTranslation.lookupOMPLoop(applyee);
+    // Check the looked-up CLI, not the operand: the operand is always set.
+    assert(consBuilderCLI && "Canonical loop must already been translated");
+    if (first != 0 && i < first - 1)
+      beforeFuse.push_back(consBuilderCLI);
+    else if (count != 0 && i >= first + count - 1)
+      afterFuse.push_back(consBuilderCLI);
+    else
+      toFuse.push_back(consBuilderCLI);
+  }
+  assert(
+      (op.getGeneratees().empty() ||
+       beforeFuse.size() + afterFuse.size() + 1 == op.getGeneratees().size()) &&
+      "Wrong number of generatees");
+
+  // do the fuse
+  llvm::CanonicalLoopInfo *generatedLoop =
+      ompBuilder->fuseLoops(loc.DL, toFuse);
+  if (!op.getGeneratees().empty()) {
+    // Generatee layout: [beforeFuse..., fused loop, afterFuse...].
+    size_t genIdx = 0;
+    for (llvm::CanonicalLoopInfo *cli : beforeFuse)
+      moduleTranslation.mapOmpLoop(op.getGeneratees()[genIdx++], cli);
+    moduleTranslation.mapOmpLoop(op.getGeneratees()[genIdx++], generatedLoop);
+    // Iterate afterFuse on its own: the generatee index is already past the
+    // leading loops and the fused loop, so it cannot double as the index (or
+    // the bound) for afterFuse without skipping trailing loops.
+    for (llvm::CanonicalLoopInfo *cli : afterFuse)
+      moduleTranslation.mapOmpLoop(op.getGeneratees()[genIdx++], cli);
+  }
+
+  // CLIs can only be consumed once
+  for (Value applyee : op.getApplyees())
+    moduleTranslation.invalidateOmpLoop(applyee);
+
+  return success();
+}
+
+
/// Convert an Atomic Ordering attribute to llvm::AtomicOrdering.
static llvm::AtomicOrdering
convertAtomicOrdering(std::optional<omp::ClauseMemoryOrderKind> ao) {
@@ -6288,6 +6339,9 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
.Case([&](omp::TileOp op) {
return applyTile(op, builder, moduleTranslation);
})
+ .Case([&](omp::FuseOp op) {
+ return applyFuse(op, builder, moduleTranslation);
+ })
.Case([&](omp::TargetAllocMemOp) {
return convertTargetAllocMemOp(*op, builder, moduleTranslation);
})
diff --git a/mlir/test/Dialect/OpenMP/cli-fuse.mlir b/mlir/test/Dialect/OpenMP/cli-fuse.mlir
new file mode 100644
index 0000000000000..284b8c914ae1f
--- /dev/null
+++ b/mlir/test/Dialect/OpenMP/cli-fuse.mlir
@@ -0,0 +1,114 @@
+// RUN: mlir-opt %s | FileCheck %s --enable-var-scope
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s --enable-var-scope
+
+
+// Raw syntax check (MLIR output is always pretty-printed)
+// CHECK-LABEL: @omp_fuse_raw(
+// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32) {
+func.func @omp_fuse_raw(%tc1 : i32, %tc2 : i32) -> () {
+ // CHECK-NEXT: %canonloop_s0 = omp.new_cli
+ %canonloop_s0 = "omp.new_cli" () : () -> (!omp.cli)
+ // CHECK-NEXT: %canonloop_s1 = omp.new_cli
+ %canonloop_s1 = "omp.new_cli" () : () -> (!omp.cli)
+ // CHECK-NEXT: %fused = omp.new_cli
+ %fused = "omp.new_cli" () : () -> (!omp.cli)
+ // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) {
+ "omp.canonical_loop" (%tc1, %canonloop_s0) ({
+ ^bb0(%iv_s0: i32):
+ // CHECK: omp.terminator
+ omp.terminator
+ }) : (i32, !omp.cli) -> ()
+ // CHECK: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) {
+ "omp.canonical_loop" (%tc2, %canonloop_s1) ({
+ ^bb0(%iv_s1: i32):
+ // CHECK: omp.terminator
+ omp.terminator
+ }) : (i32, !omp.cli) -> ()
+ // CHECK: omp.fuse (%fused) <- (%canonloop_s0, %canonloop_s1)
+ "omp.fuse"(%fused, %canonloop_s0, %canonloop_s1) <{operandSegmentSizes = array<i32: 1, 2>}> : (!omp.cli, !omp.cli, !omp.cli) -> ()
+ return
+}
+
+// Pretty syntax check
+// CHECK-LABEL: @omp_fuse_pretty(
+// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32) {
+func.func @omp_fuse_pretty(%tc1 : i32, %tc2 : i32) -> () {
+ // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli
+ %canonloop_s0 = omp.new_cli
+ // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli
+ %canonloop_s1 = omp.new_cli
+ // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli
+ %fused = omp.new_cli
+ // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) {
+ omp.canonical_loop (%canonloop_s0) %iv_s0 : i32 in range(%tc1) {
+ // CHECK: omp.terminator
+ omp.terminator
+ }
+ // CHECK: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) {
+ omp.canonical_loop (%canonloop_s1) %iv_s1 : i32 in range(%tc2) {
+ // CHECK: omp.terminator
+ omp.terminator
+ }
+ // CHECK: omp.fuse (%fused) <- (%canonloop_s0, %canonloop_s1)
+ omp.fuse(%fused) <- (%canonloop_s0, %canonloop_s1)
+ return
+}
+
+// Specifying the generatees for omp.fuse is optional
+// CHECK-LABEL: @omp_fuse_optionalgen_pretty(
+// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32) {
+func.func @omp_fuse_optionalgen_pretty(%tc1 : i32, %tc2 : i32) -> () {
+ // CHECK-NEXT: %canonloop_s0 = omp.new_cli
+ %canonloop_s0 = omp.new_cli
+ // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) {
+ omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%tc1) {
+ // CHECK: omp.terminator
+ omp.terminator
+ }
+ // CHECK: %canonloop_s1 = omp.new_cli
+ %canonloop_s1 = omp.new_cli
+ // CHECK-NEXT: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) {
+ omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%tc2) {
+ // CHECK: omp.terminator
+ omp.terminator
+ }
+ // CHECK: omp.fuse <- (%canonloop_s0, %canonloop_s1)
+ omp.fuse <- (%canonloop_s0, %canonloop_s1)
+ return
+}
+
+// Fuse with looprange attributes
+// CHECK-LABEL: @omp_fuse_looprange(
+// CHECK-SAME: %[[tc1:.+]]: i32, %[[tc2:.+]]: i32, %[[tc3:.+]]: i32) {
+func.func @omp_fuse_looprange(%tc1 : i32, %tc2 : i32, %tc3 : i32) -> () {
+ // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli
+ %canonloop_s0 = omp.new_cli
+ // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli
+ %canonloop_s1 = omp.new_cli
+ // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli
+ %canonloop_s2 = omp.new_cli
+ // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli
+ %canonloop_fuse = omp.new_cli
+ // CHECK-NEXT: %[[CANONLOOP:.+]] = omp.new_cli
+ %fused = omp.new_cli
+ // CHECK-NEXT: omp.canonical_loop(%canonloop_s0) %iv_s0 : i32 in range(%[[tc1]]) {
+ omp.canonical_loop (%canonloop_s0) %iv_s0 : i32 in range(%tc1) {
+ // CHECK: omp.terminator
+ omp.terminator
+ }
+ // CHECK: omp.canonical_loop(%canonloop_s1) %iv_s1 : i32 in range(%[[tc2]]) {
+ omp.canonical_loop (%canonloop_s1) %iv_s1 : i32 in range(%tc2) {
+ // CHECK: omp.terminator
+ omp.terminator
+ }
+ // CHECK: omp.canonical_loop(%canonloop_s2) %iv_s2 : i32 in range(%[[tc3]]) {
+ omp.canonical_loop (%canonloop_s2) %iv_s2 : i32 in range(%tc3) {
+ // CHECK: omp.terminator
+ omp.terminator
+ }
+ // CHECK: omp.fuse (%canonloop_fuse, %fused) <- (%canonloop_s0,
+ // %canonloop_s1, %canonloop_s2) {count = 2 : i32, first = 1 : i32}
+ omp.fuse(%fused, %canonloop_fuse) <- (%canonloop_s0, %canonloop_s1, %canonloop_s2) {count = 2 : i32, first = 1 : i32}
+ return
+}
+
diff --git a/mlir/test/Dialect/OpenMP/invalid-fuse.mlir b/mlir/test/Dialect/OpenMP/invalid-fuse.mlir
new file mode 100644
index 0000000000000..d763ffcea71a2
--- /dev/null
+++ b/mlir/test/Dialect/OpenMP/invalid-fuse.mlir
@@ -0,0 +1,100 @@
+// RUN: mlir-opt -split-input-file -verify-diagnostics %s
+
+
+func.func @no_loops(%tc1 : i32, %tc2 : i32) {
+ // expected-error @+1 {{'omp.fuse' op must apply to at least two loops}}
+ omp.fuse <-()
+
+ return
+}
+
+// -----
+
+func.func @one_loop(%tc1 : i32, %tc2 : i32) {
+ %canonloop = omp.new_cli
+ omp.canonical_loop(%canonloop) %iv : i32 in range(%tc1) {
+ omp.terminator
+ }
+ // expected-error @+1 {{'omp.fuse' op must apply to at least two loops}}
+ omp.fuse <-(%canonloop)
+
+ return
+}
+
+// -----
+
+func.func @missing_generator(%tc1 : i32, %tc2 : i32) {
+ // expected-error @+1 {{'omp.new_cli' op CLI has no generator}}
+ %canonloop = omp.new_cli
+
+ // expected-note @+1 {{see consumer here: "omp.fuse"(%0) <{operandSegmentSizes = array<i32: 0, 1>}> : (!omp.cli) -> ()}}
+ omp.fuse <-(%canonloop)
+
+ return
+}
+
+// -----
+
+func.func @wrong_generatees1(%tc1 : i32, %tc2 : i32) {
+ %canonloop1 = omp.new_cli
+ %canonloop2 = omp.new_cli
+ omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc1) {
+ omp.terminator
+ }
+ omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc2) {
+ omp.terminator
+ }
+
+ %fused1 = omp.new_cli
+ %fused2 = omp.new_cli
+ // expected-error @+1 {{'omp.fuse' op in a complete fuse the number of generatees must be exactly 1}}
+ omp.fuse (%fused1, %fused2) <-(%canonloop1, %canonloop2)
+
+ llvm.return
+}
+
+// -----
+
+func.func @wrong_generatees2(%tc1 : i32, %tc2 : i32, %tc3 : i32) {
+ %canonloop1 = omp.new_cli
+ %canonloop2 = omp.new_cli
+ %canonloop3 = omp.new_cli
+ omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc1) {
+ omp.terminator
+ }
+ omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc2) {
+ omp.terminator
+ }
+ omp.canonical_loop(%canonloop3) %iv : i32 in range(%tc3) {
+ omp.terminator
+ }
+
+ %fused = omp.new_cli
+ // expected-error @+1 {{'omp.fuse' op the number of generatees must be the number of aplyees plus one minus count}}
+ omp.fuse (%fused) <-(%canonloop1, %canonloop2, %canonloop3) {first = 1 : i32, count = 2 : i32}
+
+ llvm.return
+}
+
+func.func @wrong_applyees(%tc1 : i32, %tc2 : i32, %tc3 : i32) {
+ %canonloop1 = omp.new_cli
+ %canonloop2 = omp.new_cli
+ %canonloop3 = omp.new_cli
+ omp.canonical_loop(%canonloop1) %iv : i32 in range(%tc1) {
+ omp.terminator
+ }
+ omp.canonical_loop(%canonloop2) %iv : i32 in range(%tc2) {
+ omp.terminator
+ }
+ omp.canonical_loop(%canonloop3) %iv : i32 in range(%tc3) {
+ omp.terminator
+ }
+
+ %fused = omp.new_cli
+ %canonloop_fuse = omp.new_cli
+ // expected-error @+1 {{'omp.fuse' op the numbers of applyees must be at least first minus one plus count attributes}}
+ omp.fuse (%fused, %canonloop_fuse) <-(%canonloop1, %canonloop2, %canonloop3) {first = 1 : i32, count = 5 : i32}
+
+ llvm.return
+}
+
diff --git a/mlir/test/Target/LLVMIR/openmp-cli-fuse01.mlir b/mlir/test/Target/LLVMIR/openmp-cli-fuse01.mlir
new file mode 100644
index 0000000000000..0754572b24771
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-cli-fuse01.mlir
@@ -0,0 +1,100 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope
+
+
+llvm.func @fuse_trivial_loops(%baseptr: !llvm.ptr, %tc1: i32, %tc2: i32) -> () {
+ %literal_cli1 = omp.new_cli
+ omp.canonical_loop(%literal_cli1) %iv1 : i32 in range(%tc1) {
+ %ptr = llvm.getelementptr inbounds %baseptr[%iv1] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+ %val = llvm.mlir.constant(42.0 : f32) : f32
+ llvm.store %val, %ptr : f32, !llvm.ptr
+ omp.terminator
+ }
+ %literal_cli2 = omp.new_cli
+ omp.canonical_loop(%literal_cli2) %iv2 : i32 in range(%tc2) {
+ %ptr = llvm.getelementptr inbounds %baseptr[%iv2] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+ %val = llvm.mlir.constant(21.0 : f32) : f32
+ llvm.store %val, %ptr : f32, !llvm.ptr
+ omp.terminator
+ }
+ omp.fuse <- (%literal_cli1, %literal_cli2)
+ llvm.return
+}
+
+// CHECK-LABEL: define void @fuse_trivial_loops(
+// CHECK-SAME: ptr %[[VAL_11:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_16:.+]]) {
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER1:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER1]]:
+// CHECK-NEXT: br label %[[OMP_FUSE_COMP_TC:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSE_COMP_TC]]:
+// CHECK-NEXT: %[[VAL_15:.+]] = icmp sgt i32 %[[VAL_5:.+]], %[[VAL_16:.+]]
+// CHECK-NEXT: %[[VAL_17:.+]] = select i1 %[[VAL_15:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_16:.+]]
+// CHECK-NEXT: br label %[[OMP_FUSED_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_PREHEADER]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_HEADER]]:
+// CHECK-NEXT: %[[VAL_4:.+]] = phi i32 [ 0, %[[VAL_18:.+]] ], [ %[[VAL_27:.+]], %[[VAL_26:.+]] ]
+// CHECK-NEXT: br label %[[OMP_FUSED_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_COND]]:
+// CHECK-NEXT: %[[VAL_29:.+]] = icmp ult i32 %[[VAL_4:.+]], %[[VAL_17:.+]]
+// CHECK-NEXT: br i1 %[[VAL_29:.+]], label %[[OMP_FUSED_BODY:.+]], label %[[OMP_FUSED_EXIT:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_BODY]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_INNER_COND]]:
+// CHECK-NEXT: %[[VAL_3:.+]] = icmp slt i32 %[[VAL_4:.+]], %[[VAL_5:.+]]
+// CHECK-NEXT: br i1 %[[VAL_3:.+]], label %[[OMP_OMP_LOOP_BODY:.+]], label %[[OMP_FUSED_INNER_COND13:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_REGION]]:
+// CHECK-NEXT: %[[VAL_10:.+]] = getelementptr inbounds float, ptr %[[VAL_11:.+]], i32 %[[VAL_4:.+]]
+// CHECK-NEXT: store float 4.200000e+01, ptr %[[VAL_10:.+]], align 4
+// CHECK-NEXT: br label %[[OMP_REGION_CONT:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_REGION_CONT]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND13:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_INNER_COND13]]:
+// CHECK-NEXT: %[[VAL_19:.+]] = icmp slt i32 %[[VAL_4:.+]], %[[VAL_16:.+]]
+// CHECK-NEXT: br i1 %[[VAL_19:.+]], label %[[OMP_OMP_LOOP_BODY4:.+]], label %[[OMP_FUSED_PRE_LATCH:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_BODY4]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_REGION12:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_REGION12]]:
+// CHECK-NEXT: %[[VAL_23:.+]] = getelementptr inbounds float, ptr %[[VAL_11:.+]], i32 %[[VAL_4:.+]]
+// CHECK-NEXT: store float 2.100000e+01, ptr %[[VAL_23:.+]], align 4
+// CHECK-NEXT: br label %[[OMP_REGION_CONT11:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_REGION_CONT11]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_PRE_LATCH:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_PRE_LATCH]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_INC:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_INC]]:
+// CHECK-NEXT: %[[VAL_27:.+]] = add nuw i32 %[[VAL_4:.+]], 1
+// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_EXIT]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_AFTER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_AFTER]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER7:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER7]]:
+// CHECK-NEXT: ret void
+
diff --git a/mlir/test/Target/LLVMIR/openmp-cli-fuse02.mlir b/mlir/test/Target/LLVMIR/openmp-cli-fuse02.mlir
new file mode 100644
index 0000000000000..0032bd86501d0
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-cli-fuse02.mlir
@@ -0,0 +1,140 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s --enable-var-scope
+
+
+llvm.func @fuse_looprange_loops(%baseptr: !llvm.ptr, %tc1: i32, %tc2: i32, %tc3: i32) -> () {
+ %literal_cli1 = omp.new_cli
+ omp.canonical_loop(%literal_cli1) %iv1 : i32 in range(%tc1) {
+ %ptr = llvm.getelementptr inbounds %baseptr[%iv1] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+ %val = llvm.mlir.constant(42.0 : f32) : f32
+ llvm.store %val, %ptr : f32, !llvm.ptr
+ omp.terminator
+ }
+ %literal_cli2 = omp.new_cli
+ omp.canonical_loop(%literal_cli2) %iv2 : i32 in range(%tc2) {
+ %ptr = llvm.getelementptr inbounds %baseptr[%iv2] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+ %val = llvm.mlir.constant(21.0 : f32) : f32
+ llvm.store %val, %ptr : f32, !llvm.ptr
+ omp.terminator
+ }
+ %literal_cli3 = omp.new_cli
+ omp.canonical_loop(%literal_cli3) %iv3 : i32 in range(%tc3) {
+ %ptr = llvm.getelementptr inbounds %baseptr[%iv3] : (!llvm.ptr, i32) -> !llvm.ptr, f32
+ %val = llvm.mlir.constant(63.0 : f32) : f32
+ llvm.store %val, %ptr : f32, !llvm.ptr
+ omp.terminator
+ }
+ omp.fuse <- (%literal_cli1, %literal_cli2, %literal_cli3) {first = 1 : i32, count = 2 : i32}
+ llvm.return
+}
+
+
+// CHECK-LABEL: define void @fuse_looprange_loops(
+// CHECK-SAME: ptr %[[VAL_23:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_6:.+]], i32 %[[VAL_40:.+]]) {
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER1:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER1]]:
+// CHECK-NEXT: br label %[[OMP_FUSE_COMP_TC:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSE_COMP_TC]]:
+// CHECK-NEXT: %[[VAL_4:.+]] = icmp sgt i32 %[[VAL_5:.+]], %[[VAL_6:.+]]
+// CHECK-NEXT: %[[VAL_7:.+]] = select i1 %[[VAL_4:.+]], i32 %[[VAL_5:.+]], i32 %[[VAL_6:.+]]
+// CHECK-NEXT: br label %[[OMP_FUSED_PREHEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_PREHEADER]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_HEADER]]:
+// CHECK-NEXT: %[[VAL_11:.+]] = phi i32 [ 0, %[[VAL_8:.+]] ], [ %[[VAL_12:.+]], %[[VAL_10:.+]] ]
+// CHECK-NEXT: br label %[[OMP_FUSED_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_COND]]:
+// CHECK-NEXT: %[[VAL_14:.+]] = icmp ult i32 %[[VAL_11:.+]], %[[VAL_7:.+]]
+// CHECK-NEXT: br i1 %[[VAL_14:.+]], label %[[OMP_FUSED_BODY:.+]], label %[[OMP_FUSED_EXIT:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_BODY]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_INNER_COND]]:
+// CHECK-NEXT: %[[VAL_18:.+]] = icmp slt i32 %[[VAL_11:.+]], %[[VAL_5:.+]]
+// CHECK-NEXT: br i1 %[[VAL_18:.+]], label %[[OMP_OMP_LOOP_BODY:.+]], label %[[OMP_FUSED_INNER_COND25:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_BODY]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_REGION:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_REGION]]:
+// CHECK-NEXT: %[[VAL_22:.+]] = getelementptr inbounds float, ptr %[[VAL_23:.+]], i32 %[[VAL_11:.+]]
+// CHECK-NEXT: store float 4.200000e+01, ptr %[[VAL_22:.+]], align 4
+// CHECK-NEXT: br label %[[OMP_REGION_CONT:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_REGION_CONT]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_INNER_COND25:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_INNER_COND25]]:
+// CHECK-NEXT: %[[VAL_25:.+]] = icmp slt i32 %[[VAL_11:.+]], %[[VAL_6:.+]]
+// CHECK-NEXT: br i1 %[[VAL_25:.+]], label %[[OMP_OMP_LOOP_BODY4:.+]], label %[[OMP_FUSED_PRE_LATCH:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_BODY4]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_REGION12:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_REGION12]]:
+// CHECK-NEXT: %[[VAL_29:.+]] = getelementptr inbounds float, ptr %[[VAL_23:.+]], i32 %[[VAL_11:.+]]
+// CHECK-NEXT: store float 2.100000e+01, ptr %[[VAL_29:.+]], align 4
+// CHECK-NEXT: br label %[[OMP_REGION_CONT11:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_REGION_CONT11]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_PRE_LATCH:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_PRE_LATCH]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_INC:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_INC]]:
+// CHECK-NEXT: %[[VAL_12:.+]] = add nuw i32 %[[VAL_11:.+]], 1
+// CHECK-NEXT: br label %[[OMP_FUSED_HEADER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_EXIT]]:
+// CHECK-NEXT: br label %[[OMP_FUSED_AFTER:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_FUSED_AFTER]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER7:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER7]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_PREHEADER13:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_PREHEADER13]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_HEADER14:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_HEADER14]]:
+// CHECK-NEXT: %[[VAL_36:.+]] = phi i32 [ 0, %[[VAL_33:.+]] ], [ %[[VAL_37:.+]], %[[VAL_35:.+]] ]
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_COND15:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_COND15]]:
+// CHECK-NEXT: %[[VAL_39:.+]] = icmp ult i32 %[[VAL_36:.+]], %[[VAL_40:.+]]
+// CHECK-NEXT: br i1 %[[VAL_39:.+]], label %[[OMP_OMP_LOOP_BODY16:.+]], label %[[OMP_OMP_LOOP_EXIT18:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_BODY16]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_REGION24:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_LOOP_REGION24]]:
+// CHECK-NEXT: %[[VAL_44:.+]] = getelementptr inbounds float, ptr %[[VAL_23:.+]], i32 %[[VAL_36:.+]]
+// CHECK-NEXT: store float 6.300000e+01, ptr %[[VAL_44:.+]], align 4
+// CHECK-NEXT: br label %[[OMP_REGION_CONT23:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_REGION_CONT23]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_INC17:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_INC17]]:
+// CHECK-NEXT: %[[VAL_37:.+]] = add nuw i32 %[[VAL_36:.+]], 1
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_HEADER14:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_EXIT18]]:
+// CHECK-NEXT: br label %[[OMP_OMP_LOOP_AFTER19:.+]]
+// CHECK-EMPTY:
+// CHECK-NEXT: [[OMP_OMP_LOOP_AFTER19]]:
+// CHECK-NEXT: ret void
+
diff --git a/openmp/runtime/test/transform/fuse/do-looprange.f90 b/openmp/runtime/test/transform/fuse/do-looprange.f90
new file mode 100644
index 0000000000000..8c62b24c4744f
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/do-looprange.f90
@@ -0,0 +1,60 @@
+! RUN: %flang %flags %openmp_flags -fopenmp-version=60 %s -o %t.exe
+! RUN: %t.exe | FileCheck %s --match-full-lines
+
+program fuse_full
+ implicit none
+ integer i, j, k, u
+
+ print *, 'do'
+
+ !$OMP FUSE LOOPRANGE(2,2)
+ do i=5, 25, 5
+ print '("i=", I0)', i
+ end do
+ do j=10, 100, 10
+ print '("j=", I0)', j
+ end do
+ do k=10, 0, -1
+ print '("k=", I0)', k
+ end do
+ do u=5, 25, 5
+ print '("u=", I0)', u
+ end do
+ !$OMP END FUSE
+
+ print *, 'done'
+end program
+
+! CHECK: do
+! CHECK-NEXT: i=5
+! CHECK-NEXT: i=10
+! CHECK-NEXT: i=15
+! CHECK-NEXT: i=20
+! CHECK-NEXT: i=25
+! CHECK-NEXT: j=10
+! CHECK-NEXT: k=10
+! CHECK-NEXT: j=20
+! CHECK-NEXT: k=9
+! CHECK-NEXT: j=30
+! CHECK-NEXT: k=8
+! CHECK-NEXT: j=40
+! CHECK-NEXT: k=7
+! CHECK-NEXT: j=50
+! CHECK-NEXT: k=6
+! CHECK-NEXT: j=60
+! CHECK-NEXT: k=5
+! CHECK-NEXT: j=70
+! CHECK-NEXT: k=4
+! CHECK-NEXT: j=80
+! CHECK-NEXT: k=3
+! CHECK-NEXT: j=90
+! CHECK-NEXT: k=2
+! CHECK-NEXT: j=100
+! CHECK-NEXT: k=1
+! CHECK-NEXT: k=0
+! CHECK-NEXT: u=5
+! CHECK-NEXT: u=10
+! CHECK-NEXT: u=15
+! CHECK-NEXT: u=20
+! CHECK-NEXT: u=25
+! CHECK-NEXT: done
diff --git a/openmp/runtime/test/transform/fuse/do.f90 b/openmp/runtime/test/transform/fuse/do.f90
new file mode 100644
index 0000000000000..d4496bce4d723
--- /dev/null
+++ b/openmp/runtime/test/transform/fuse/do.f90
@@ -0,0 +1,52 @@
+! RUN: %flang %flags %openmp_flags -fopenmp-version=60 %s -o %t.exe
+! RUN: %t.exe | FileCheck %s --match-full-lines
+
+program fuse_full
+ implicit none
+ integer i, j, k
+
+ print *, 'do'
+
+ !$OMP FUSE
+ do i=5, 25, 5
+ print '("i=", I0)', i
+ end do
+ do j=10, 100, 10
+ print '("j=", I0)', j
+ end do
+ do k=10, 0, -1
+ print '("k=", I0)', k
+ end do
+ !$OMP END FUSE
+
+ print *, 'done'
+end program
+
+! CHECK: do
+! CHECK-NEXT: i=5
+! CHECK-NEXT: j=10
+! CHECK-NEXT: k=10
+! CHECK-NEXT: i=10
+! CHECK-NEXT: j=20
+! CHECK-NEXT: k=9
+! CHECK-NEXT: i=15
+! CHECK-NEXT: j=30
+! CHECK-NEXT: k=8
+! CHECK-NEXT: i=20
+! CHECK-NEXT: j=40
+! CHECK-NEXT: k=7
+! CHECK-NEXT: i=25
+! CHECK-NEXT: j=50
+! CHECK-NEXT: k=6
+! CHECK-NEXT: j=60
+! CHECK-NEXT: k=5
+! CHECK-NEXT: j=70
+! CHECK-NEXT: k=4
+! CHECK-NEXT: j=80
+! CHECK-NEXT: k=3
+! CHECK-NEXT: j=90
+! CHECK-NEXT: k=2
+! CHECK-NEXT: j=100
+! CHECK-NEXT: k=1
+! CHECK-NEXT: k=0
+! CHECK-NEXT: done
More information about the llvm-commits
mailing list