[Mlir-commits] [mlir] [mlir][acc] Add utilities for converting acc.loop to scf (PR #172953)
Razvan Lupusoru
llvmlistbot at llvm.org
Mon Dec 22 09:59:41 PST 2025
https://github.com/razvanlupusoru updated https://github.com/llvm/llvm-project/pull/172953
>From d1121ab1c878f4577436a3a3d6d9cf7cba53bfd4 Mon Sep 17 00:00:00 2001
From: Scott Manley <rscottmanley at nvidia.com>
Date: Thu, 18 Dec 2025 21:19:09 -0800
Subject: [PATCH 1/7] [mlir][acc] Add utilities for converting acc.loop to scf
Add OpenACCUtilsLoop.h/.cpp with utilities for converting acc.loop
operations to SCF dialect operations:
- convertACCLoopToSCFFor: Convert structured acc.loop to scf.for
with loop collapsing support
- convertACCLoopToSCFParallel: Convert acc.loop to scf.parallel
- convertUnstructuredACCLoopToSCFExecuteRegion: Convert unstructured
acc.loop (multi-block) to scf.execute_region
Key features:
- Automatic type conversion between integer types and index
- Inclusive-to-exclusive upper bound conversion
- Trip count calculation with clamping for negative counts
- Constant folding via createOrFold for cleaner IR
- Assertions to prevent misuse (e.g., builder inside loop region)
- Error emission for unsupported cases (loops with results)
Comprehensive unit tests covering these APIs are also added.
---
.../mlir/Dialect/OpenACC/OpenACCUtilsLoop.h | 54 ++
mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt | 5 +-
.../OpenACC/Utils/OpenACCUtilsLoop.cpp | 323 ++++++++++
mlir/unittests/Dialect/OpenACC/CMakeLists.txt | 4 +
.../Dialect/OpenACC/OpenACCUtilsLoopTest.cpp | 597 ++++++++++++++++++
5 files changed, 982 insertions(+), 1 deletion(-)
create mode 100644 mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h
create mode 100644 mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
create mode 100644 mlir/unittests/Dialect/OpenACC/OpenACCUtilsLoopTest.cpp
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h
new file mode 100644
index 0000000000000..d2e7174fd306a
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h
@@ -0,0 +1,54 @@
+//===- OpenACCUtilsLoop.h - OpenACC Loop Utilities --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Utilities for converting OpenACC loop operations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_OPENACC_OPENACCUTILSLOOP_H_
+#define MLIR_DIALECT_OPENACC_OPENACCUTILSLOOP_H_
+
+namespace mlir {
+class OpBuilder;
+namespace scf {
+class ForOp;
+class ParallelOp;
+class ExecuteRegionOp;
+} // namespace scf
+namespace acc {
+class LoopOp;
+
+/// Convert a structured acc.loop to scf.for.
+/// The loop arguments are converted to index type. If enableCollapse is true,
+/// nested loops are collapsed into a single loop.
+/// One scf.for is created per induction variable, nested outermost-first, and
+/// inserted immediately before `loopOp`. Inclusive upper bounds are rewritten
+/// to the exclusive form scf.for expects. The acc.loop itself is not erased.
+/// Asserts that `loopOp` is not marked unstructured.
+/// @param loopOp The acc.loop operation to convert (must not be unstructured)
+/// @param enableCollapse Whether to collapse nested loops into one
+/// @return The created scf.for operation or nullptr on creation error.
+/// An InFlightDiagnostic is emitted on creation error.
+scf::ForOp convertACCLoopToSCFFor(LoopOp loopOp, bool enableCollapse);
+
+/// Convert acc.loop to scf.parallel.
+/// The loop induction variables are converted to index types.
+/// Every dimension is normalized to lower bound 0 and step 1, with the trip
+/// count as upper bound; the original induction value is recomputed inside the
+/// body as `iv * step + lb`. A multi-block loop body is wrapped in an
+/// scf.execute_region. The acc.loop itself is not erased.
+/// Asserts that `loopOp` is not marked unstructured and that the builder's
+/// insertion point is not nested inside `loopOp`.
+/// @param loopOp The acc.loop operation to convert
+/// @param builder OpBuilder for creating operations
+/// @return The created scf.parallel operation or nullptr on creation error.
+/// An InFlightDiagnostic is emitted on creation error.
+scf::ParallelOp convertACCLoopToSCFParallel(LoopOp loopOp, OpBuilder &builder);
+
+/// Convert an unstructured acc.loop to scf.execute_region.
+/// The loop region is cloned into the new operation and its trailing ACC
+/// terminator is replaced by scf.yield. The acc.loop itself is not erased.
+/// Asserts that `loopOp` is marked unstructured and that the builder's
+/// insertion point is not nested inside `loopOp`.
+/// @param loopOp The acc.loop operation to convert (must be unstructured)
+/// @param builder OpBuilder for creating operations
+/// @return The created scf.execute_region operation or nullptr on creation
+/// error. An InFlightDiagnostic is emitted on creation error.
+scf::ExecuteRegionOp
+convertUnstructuredACCLoopToSCFExecuteRegion(LoopOp loopOp, OpBuilder &builder);
+
+} // namespace acc
+} // namespace mlir
+
+#endif // MLIR_DIALECT_OPENACC_OPENACCUTILSLOOP_H_
diff --git a/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt
index c7c322be70d09..532ba90355b44 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt
@@ -1,6 +1,7 @@
add_mlir_dialect_library(MLIROpenACCUtils
- OpenACCUtilsTiling.cpp
OpenACCUtils.cpp
+ OpenACCUtilsLoop.cpp
+ OpenACCUtilsTiling.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenACC
@@ -19,6 +20,8 @@ add_mlir_dialect_library(MLIROpenACCUtils
MLIRArithUtils
MLIROpenACCDialect
MLIRIR
+ MLIRSCFDialect
+ MLIRSCFUtils
MLIRSupport
MLIRTransformUtils
)
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
new file mode 100644
index 0000000000000..fe0707320cc79
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
@@ -0,0 +1,323 @@
+//===- OpenACCUtilsLoop.cpp - OpenACC Loop Utilities ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains utility functions for converting OpenACC loops to SCF.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/OpenACCUtilsLoop.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
+#include "mlir/IR/IRMapping.h"
+
+using namespace mlir;
+
+namespace {
+
+/// Build the IR that computes a loop's iteration count in index type:
+///   max(0, (ub - lb + step) / step)
+/// The formula expects an inclusive upper bound, so when `inclusiveUpperbound`
+/// is false 1 is subtracted from `ub` first. A negative quotient (lb > ub with
+/// a positive step, or lb < ub with a negative step) is clamped to 0.
+/// NOTE(review): the exclusive-bound adjustment always subtracts 1, which
+/// looks tailored to positive steps -- confirm that exclusive bounds with a
+/// negative step are not expected here.
+static Value calculateTripCount(OpBuilder &b, Location loc, Value lb, Value ub,
+                                Value step, bool inclusiveUpperbound) {
+  Type indexTy = b.getIndexType();
+
+  // All arithmetic below is done on index-typed values.
+  Value lowerIdx = getValueOrCreateCastToIndexLike(b, loc, indexTy, lb);
+  Value upperIdx = getValueOrCreateCastToIndexLike(b, loc, indexTy, ub);
+  Value stepIdx = getValueOrCreateCastToIndexLike(b, loc, indexTy, step);
+
+  // Turn an exclusive bound into the inclusive bound the formula expects.
+  if (!inclusiveUpperbound) {
+    Value cstOne = arith::ConstantIndexOp::create(b, loc, 1);
+    upperIdx = b.createOrFold<arith::SubIOp>(loc, upperIdx, cstOne);
+  }
+
+  // (ub - lb + step) / step, using signed division.
+  Value extent = b.createOrFold<arith::SubIOp>(loc, upperIdx, lowerIdx);
+  Value numerator = b.createOrFold<arith::AddIOp>(loc, extent, stepIdx);
+  Value rawCount = b.createOrFold<arith::DivSIOp>(loc, numerator, stepIdx);
+
+  // A loop that never executes must report zero iterations, not a negative
+  // count.
+  Value cstZero = arith::ConstantIndexOp::create(b, loc, 0);
+  Value belowZero = b.createOrFold<arith::CmpIOp>(
+      loc, arith::CmpIPredicate::slt, rawCount, cstZero);
+  return b.createOrFold<arith::SelectOp>(loc, belowZero, cstZero, rawCount);
+}
+
+/// Return the upper bound of dimension `ivPos` of `loopOp` in exclusive form.
+/// When the loop marks that dimension as inclusive, `bound + 1` is built in
+/// the bound's own type; otherwise the original bound value is returned and
+/// no IR is created.
+static Value getExclusiveUpperBound(acc::LoopOp loopOp, size_t ivPos,
+                                    OpBuilder &b) {
+  Value upper = loopOp.getUpperbound()[ivPos];
+
+  // A missing attribute means every dimension is exclusive.
+  const bool inclusive =
+      loopOp.getInclusiveUpperbound().has_value() &&
+      loopOp.getInclusiveUpperboundAttr().asArrayRef()[ivPos];
+  if (!inclusive)
+    return upper;
+
+  Location loc = upper.getLoc();
+  Type upperTy = upper.getType();
+
+  // The constant 1 must have the same type as the bound it is added to.
+  Value increment;
+  if (upperTy.isIndex())
+    increment = arith::ConstantIndexOp::create(b, loc, 1);
+  else
+    increment = arith::ConstantIntOp::create(b, loc, upperTy, 1);
+
+  return b.createOrFold<arith::AddIOp>(loc, upper, increment);
+}
+
+/// Bridge the type gap between SCF induction variables (index) and the
+/// induction variables of `accLoop` (index or integer).
+/// For each pair (ACC IV, SCF IV), taken in order, a cast of the SCF IV to the
+/// ACC IV's type is created with `b` when needed, and `mapping` is updated so
+/// the ACC IV maps to that value. `newIVs` is expected to line up 1:1 with the
+/// ACC loop's block arguments.
+static void mapACCLoopIVsToSCFIVs(acc::LoopOp accLoop, ValueRange newIVs,
+                                  OpBuilder &b, IRMapping &mapping) {
+  Location loc = accLoop->getLoc();
+  for (auto ivPair : llvm::zip(accLoop.getBody().getArguments(), newIVs)) {
+    Value accIV = std::get<0>(ivPair);
+    Value scfIV = std::get<1>(ivPair);
+    Value castedIV =
+        getValueOrCreateCastToIndexLike(b, loc, accIV.getType(), scfIV);
+    mapping.map(accIV, castedIV);
+  }
+}
+
+/// Rewrite the uses of a normalized induction variable so they observe the
+/// original (denormalized) value.
+/// After a loop has been normalized to lb=0 / step=1, the original induction
+/// value is recovered as:
+///   original_iv = iv * origStep + origLB
+/// Both operations are created at the builder's current insertion point and
+/// every other use of `iv` is redirected to the result.
+static void normalizeIVUses(OpBuilder &b, Location loc, Value iv, Value origLB,
+                            Value origStep) {
+  Type idxTy = b.getIndexType();
+  Value lowerIdx = getValueOrCreateCastToIndexLike(b, loc, idxTy, origLB);
+  Value stepIdx = getValueOrCreateCastToIndexLike(b, loc, idxTy, origStep);
+
+  // iv * step + lb
+  auto mulOp = arith::MulIOp::create(b, loc, iv, stepIdx);
+  auto addOp = arith::AddIOp::create(b, loc, mulOp.getResult(), lowerIdx);
+
+  // The two ops above must keep consuming the raw IV (directly or through
+  // each other); everything else switches to the denormalized value.
+  llvm::SmallPtrSet<Operation *, 2> keepRawIV;
+  keepRawIV.insert(mulOp.getOperation());
+  keepRawIV.insert(addOp.getOperation());
+  iv.replaceAllUsesExcept(addOp.getResult(), keepRawIV);
+}
+
+/// Clone a single-block ACC region into `dest` at `insertionPoint`, dropping
+/// the ACC terminator (acc.yield or acc.terminator) of the clone.
+///
+/// Values already present in `mapping` are substituted while cloning, so block
+/// arguments of `src` that are mapped are not re-created in the destination.
+///
+/// @param src Region to clone; must consist of exactly one block.
+/// @param dest Block receiving the cloned operations.
+/// @param insertionPoint Position in `dest` before which the clone is placed.
+/// @param mapping Value mapping applied to (and extended by) the clone.
+/// @return Iterator into `dest` designating the position right after the
+/// cloned operations: the operation that was at `insertionPoint`, or
+/// `dest->end()` if there was none. The iterator stays valid even when the
+/// cloned body contained nothing but its terminator.
+static Block::iterator cloneACCRegionInto(Region *src, Block *dest,
+                                          Block::iterator insertionPoint,
+                                          IRMapping &mapping) {
+  assert(src->hasOneBlock() && "expected single-block region");
+
+  // Split `dest` so that the clone lands between its head and its tail.
+  Region *insertRegion = dest->getParent();
+  Block *tailBlock = dest->splitBlock(insertionPoint);
+  src->cloneInto(insertRegion, tailBlock->getIterator(), mapping);
+
+  // `src` has one block, so exactly one block now sits between `dest` and
+  // `tailBlock`.
+  Block *clonedBlock = &*std::prev(tailBlock->getIterator());
+
+  // Drop the ACC terminator; the destination block supplies its own.
+  Operation *terminator = clonedBlock->getTerminator();
+  if (!isa<acc::YieldOp, acc::TerminatorOp>(terminator))
+    llvm_unreachable("unexpected terminator in ACC region");
+  terminator->erase();
+
+  // Remember where the tail starts before the blocks are merged back. Using
+  // the operation preceding the terminator instead would step before begin()
+  // for a body that only holds its terminator.
+  Operation *firstTailOp = tailBlock->empty() ? nullptr : &tailBlock->front();
+
+  // Merge the tail back behind the cloned operations.
+  clonedBlock->getOperations().splice(clonedBlock->end(),
+                                      tailBlock->getOperations());
+  tailBlock->erase();
+
+  // Merge the cloned operations (and the tail) back into `dest`.
+  dest->getOperations().splice(dest->end(), clonedBlock->getOperations());
+  clonedBlock->erase();
+
+  return firstTailOp ? firstTailOp->getIterator() : dest->end();
+}
+
+/// Wrap a (possibly multi-block) ACC region in a new scf.execute_region.
+///
+/// The region is cloned into a result-less scf.execute_region created at the
+/// builder's insertion point, and the ACC terminator of the last cloned block
+/// is replaced by scf.yield. The builder's insertion point is restored before
+/// returning, so it ends up right after the new operation.
+///
+/// @param region Region to clone; its last block must end in acc.yield or
+/// acc.terminator.
+/// @param mapping Value mapping applied to (and extended by) the clone.
+/// @param loc Location for the created operations.
+/// @param b Builder used to create the operations.
+/// @return The new scf.execute_region, or nullptr if the terminating acc.yield
+/// carries operands (results are not supported yet). In that case an error is
+/// emitted on the region's parent operation and no IR is created.
+static scf::ExecuteRegionOp
+wrapMultiBlockRegionWithSCFExecuteRegion(Region &region, IRMapping &mapping,
+                                         Location loc, OpBuilder &b) {
+  // Validate the terminator on the source region first: bailing out after the
+  // clone would leave a half-converted scf.execute_region behind.
+  Operation *srcTerminator = region.back().getTerminator();
+  if (auto yieldOp = dyn_cast<acc::YieldOp>(srcTerminator)) {
+    if (yieldOp.getNumOperands() > 0) {
+      region.getParentOp()->emitError(
+          "acc.loop with results not yet supported");
+      return nullptr;
+    }
+  } else if (!isa<acc::TerminatorOp>(srcTerminator)) {
+    llvm_unreachable("unexpected terminator in ACC region");
+  }
+
+  // Do not leave the caller's builder parked behind the new scf.yield.
+  OpBuilder::InsertionGuard guard(b);
+
+  auto exeRegionOp = scf::ExecuteRegionOp::create(b, loc, TypeRange{});
+  b.cloneRegionBefore(region, exeRegionOp.getRegion(),
+                      exeRegionOp.getRegion().end(), mapping);
+
+  // Replace the cloned ACC terminator with scf.yield.
+  Block &lastBlock = exeRegionOp.getRegion().back();
+  lastBlock.getTerminator()->erase();
+  b.setInsertionPointToEnd(&lastBlock);
+  scf::YieldOp::create(b, loc);
+  return exeRegionOp;
+}
+
+} // namespace
+
+namespace mlir {
+namespace acc {
+
+/// Convert a structured acc.loop into a nest of scf.for operations, one per
+/// induction variable, created immediately before `loopOp`.
+///
+/// Bounds and steps are cast to index type and inclusive upper bounds are made
+/// exclusive. Inside the innermost loop the index-typed induction variables
+/// are cast back to the original types before the loop body is cloned in. The
+/// acc.loop itself is left in place.
+///
+/// @param loopOp Loop to convert; must not be marked unstructured.
+/// @param enableCollapse When true and the loop has more than one dimension,
+/// the nest is coalesced into a single scf.for.
+/// @return The outermost scf.for, or nullptr (after emitting an error) when
+/// the loop has no induction variables. If coalescing fails an error is
+/// emitted on `loopOp` and the outermost loop of the nest is still returned.
+scf::ForOp convertACCLoopToSCFFor(LoopOp loopOp, bool enableCollapse) {
+  assert(!loopOp.getUnstructured() &&
+         "use convertUnstructuredACCLoopToSCFExecuteRegion for unstructured "
+         "loops");
+
+  // Without induction variables there is no loop nest to build; the code
+  // below would otherwise access an empty vector.
+  if (loopOp.getBody().getNumArguments() == 0) {
+    loopOp.emitError(
+        "acc.loop without induction variables cannot be converted to scf.for");
+    return nullptr;
+  }
+
+  Location loc = loopOp->getLoc();
+
+  // `boundsBuilder` emits bound computations ahead of the whole nest, while
+  // `nestBuilder` descends into each newly created loop.
+  OpBuilder boundsBuilder(loopOp);
+  OpBuilder nestBuilder(loopOp);
+  Type indexType = boundsBuilder.getIndexType();
+
+  // Create nested scf.for loops and build the IR mapping for the IVs.
+  IRMapping mapping;
+  SmallVector<scf::ForOp, 4> forOps;
+  SmallVector<Value, 4> scfIVs;
+  for (BlockArgument iv : loopOp.getBody().getArguments()) {
+    size_t idx = iv.getArgNumber();
+
+    Value newLowerBound = getValueOrCreateCastToIndexLike(
+        boundsBuilder, loc, indexType, loopOp.getLowerbound()[idx]);
+    Value newUpperBound = getValueOrCreateCastToIndexLike(
+        boundsBuilder, loc, indexType,
+        getExclusiveUpperBound(loopOp, idx, boundsBuilder));
+    Value newStep = getValueOrCreateCastToIndexLike(
+        boundsBuilder, loc, indexType, loopOp.getStep()[idx]);
+
+    scf::ForOp forOp = scf::ForOp::create(nestBuilder, loc, newLowerBound,
+                                          newUpperBound, newStep);
+    forOps.push_back(forOp);
+    scfIVs.push_back(forOp.getInductionVar());
+    mapping.map(iv, forOp.getInductionVar());
+
+    // The bounds builder stays before the outermost loop so that bounds of
+    // inner dimensions are still computed outside the nest.
+    if (idx == 0)
+      boundsBuilder.setInsertionPoint(forOp);
+
+    // The nest builder moves into each new loop.
+    nestBuilder.setInsertionPointToStart(forOp.getBody());
+  }
+
+  // Handle IV type conversion (index -> original type).
+  mapACCLoopIVsToSCFIVs(loopOp, scfIVs, nestBuilder, mapping);
+
+  // Clone the loop body into the innermost scf.for.
+  cloneACCRegionInto(&loopOp.getRegion(), forOps.back().getBody(),
+                     nestBuilder.getInsertionPoint(), mapping);
+
+  // Optionally collapse nested loops.
+  if (enableCollapse && forOps.size() > 1)
+    if (failed(coalesceLoops(forOps)))
+      loopOp.emitError("failed to collapse acc.loop");
+
+  return forOps.front();
+}
+
+/// Convert an acc.loop into a single scf.parallel covering all of its
+/// dimensions.
+///
+/// Every dimension is normalized to lower bound 0 and step 1, with the trip
+/// count as upper bound. Inside the body the original induction value is
+/// recomputed as `iv * step + lb` and cast back to the original type. A
+/// multi-block loop body is wrapped in an scf.execute_region. The acc.loop
+/// itself is left in place.
+///
+/// @param loopOp Loop to convert; must not be marked unstructured.
+/// @param b Builder used for all created operations; its insertion point must
+/// not be nested inside `loopOp`. On success it is left at the start of the
+/// scf.parallel body; on failure it is left where the scf.parallel had been
+/// created.
+/// @return The new scf.parallel, or nullptr if the body could not be converted
+/// (an error has been emitted in that case).
+scf::ParallelOp convertACCLoopToSCFParallel(LoopOp loopOp, OpBuilder &b) {
+  assert(!loopOp.getUnstructured() &&
+         "use convertUnstructuredACCLoopToSCFExecuteRegion for unstructured "
+         "loops");
+  assert(b.getInsertionBlock() &&
+         !loopOp->isProperAncestor(b.getInsertionBlock()->getParentOp()) &&
+         "builder insertion point must not be inside the loop being converted");
+
+  Location loc = loopOp->getLoc();
+
+  SmallVector<Value, 4> lowerBounds, upperBounds, steps;
+
+  // Normalize all loops: lb=0, step=1, ub=tripCount
+  Value lb = arith::ConstantIndexOp::create(b, loc, 0);
+  Value step = arith::ConstantIndexOp::create(b, loc, 1);
+
+  const size_t numIVs = loopOp.getBody().getNumArguments();
+  for (size_t idx = 0; idx < numIVs; ++idx) {
+    bool inclusiveUpperbound = false;
+    if (loopOp.getInclusiveUpperbound().has_value())
+      inclusiveUpperbound = loopOp.getInclusiveUpperbound().value()[idx];
+
+    Value ub = calculateTripCount(b, loc, loopOp.getLowerbound()[idx],
+                                  loopOp.getUpperbound()[idx],
+                                  loopOp.getStep()[idx], inclusiveUpperbound);
+
+    lowerBounds.push_back(lb);
+    upperBounds.push_back(ub);
+    steps.push_back(step);
+  }
+
+  auto parallelOp =
+      scf::ParallelOp::create(b, loc, lowerBounds, upperBounds, steps);
+
+  // Create IV type conversions
+  IRMapping mapping;
+  b.setInsertionPointToStart(parallelOp.getBody());
+  mapACCLoopIVsToSCFIVs(loopOp, parallelOp.getInductionVars(), b, mapping);
+
+  if (!loopOp.getRegion().hasOneBlock()) {
+    auto exeRegion = wrapMultiBlockRegionWithSCFExecuteRegion(
+        loopOp.getRegion(), mapping, loc, b);
+    if (!exeRegion) {
+      // Move the builder out of the loop before erasing it; otherwise the
+      // caller would be left with an insertion point inside a deleted block.
+      b.setInsertionPointAfter(parallelOp);
+      parallelOp.erase();
+      return nullptr;
+    }
+  } else {
+    cloneACCRegionInto(&loopOp.getRegion(), parallelOp.getBody(),
+                       b.getInsertionPoint(), mapping);
+  }
+
+  // Denormalize IV uses: the body still expects values from the original
+  // iteration space.
+  b.setInsertionPointToStart(parallelOp.getBody());
+  for (auto [idx, iv] : llvm::enumerate(parallelOp.getBody()->getArguments()))
+    if (!iv.use_empty())
+      normalizeIVUses(b, loc, iv, loopOp.getLowerbound()[idx],
+                      loopOp.getStep()[idx]);
+
+  return parallelOp;
+}
+
+/// Convert an unstructured acc.loop by cloning its region into a new
+/// scf.execute_region created with `b`. Asserts that the loop is marked
+/// unstructured and that the builder is not positioned inside it. Returns
+/// whatever the region-wrapping helper produces, which is null on failure.
+scf::ExecuteRegionOp
+convertUnstructuredACCLoopToSCFExecuteRegion(LoopOp loopOp, OpBuilder &b) {
+  assert(loopOp.getUnstructured() &&
+         "use convertACCLoopToSCFFor for structured loops");
+  assert(b.getInsertionBlock() &&
+         !loopOp->isProperAncestor(b.getInsertionBlock()->getParentOp()) &&
+         "builder insertion point must not be inside the loop being converted");
+
+  // No values are pre-mapped: the region is cloned as-is.
+  IRMapping emptyMapping;
+  Location loopLoc = loopOp->getLoc();
+  Region &loopRegion = loopOp.getRegion();
+  return wrapMultiBlockRegionWithSCFExecuteRegion(loopRegion, emptyMapping,
+                                                  loopLoc, b);
+}
+
+} // namespace acc
+} // namespace mlir
diff --git a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
index 060c8b8d2679d..29448d2af5537 100644
--- a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
+++ b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
@@ -3,13 +3,17 @@ add_mlir_unittest(MLIROpenACCTests
OpenACCOpsInterfacesTest.cpp
OpenACCUtilsTest.cpp
OpenACCUtilsTilingTest.cpp
+ OpenACCUtilsLoopTest.cpp
)
mlir_target_link_libraries(MLIROpenACCTests
PRIVATE
MLIRIR
+ MLIRAffineDialect
MLIRFuncDialect
MLIRMemRefDialect
MLIRArithDialect
MLIROpenACCDialect
MLIROpenACCUtils
+ MLIRSCFDialect
+ MLIRControlFlowDialect
)
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsLoopTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsLoopTest.cpp
new file mode 100644
index 0000000000000..e23ff2049ca37
--- /dev/null
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsLoopTest.cpp
@@ -0,0 +1,597 @@
+//===- OpenACCUtilsLoopTest.cpp - Unit tests for OpenACC loop utilities --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/OpenACCUtilsLoop.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/OwningOpRef.h"
+#include "mlir/IR/Value.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+using namespace mlir::acc;
+
+//===----------------------------------------------------------------------===//
+// Test Fixture
+//===----------------------------------------------------------------------===//
+
+/// Shared fixture for the acc.loop -> SCF conversion tests. It owns the MLIR
+/// context, a builder and an unknown location, and offers helpers to build
+/// constants, acc.loop operations and an enclosing module/function.
+class OpenACCUtilsLoopTest : public ::testing::Test {
+protected:
+  OpenACCUtilsLoopTest() : b(&context), loc(UnknownLoc::get(&context)) {
+    context.loadDialect<acc::OpenACCDialect, affine::AffineDialect,
+                        arith::ArithDialect, memref::MemRefDialect,
+                        func::FuncDialect, scf::SCFDialect,
+                        cf::ControlFlowDialect>();
+  }
+
+  /// Helper to create an index constant
+  Value createIndexConstant(int64_t value) {
+    return arith::ConstantOp::create(b, loc, b.getIndexType(),
+                                     b.getIndexAttr(value));
+  }
+
+  /// Helper to create an i32 constant
+  Value createI32Constant(int32_t value) {
+    return arith::ConstantOp::create(b, loc, b.getI32Type(),
+                                     b.getI32IntegerAttr(value));
+  }
+
+  /// Helper to create a simple acc.loop with the given bounds. The body is a
+  /// single block with one argument per lower bound (typed like that bound)
+  /// and a lone acc.yield. The same inclusive flag is applied to every
+  /// dimension. Preserves the builder's insertion point.
+  acc::LoopOp createLoopOp(ValueRange lbs, ValueRange ubs, ValueRange steps,
+                           bool inclusiveUpperbound = true) {
+    OpBuilder::InsertionGuard guard(b);
+
+    auto loopOp = acc::LoopOp::create(b, loc, lbs, ubs, steps,
+                                      acc::LoopParMode::loop_independent);
+
+    // Set inclusive upper bound attribute
+    SmallVector<bool> inclusiveFlags(lbs.size(), inclusiveUpperbound);
+    loopOp.setInclusiveUpperboundAttr(b.getDenseBoolArrayAttr(inclusiveFlags));
+
+    // Add body block with IV arguments and yield
+    Region &region = loopOp.getRegion();
+    Block *block = b.createBlock(&region, region.begin());
+    for (Value lb : lbs)
+      block->addArgument(lb.getType(), loc);
+    b.setInsertionPointToEnd(block);
+    acc::YieldOp::create(b, loc);
+
+    return loopOp;
+  }
+
+  /// Helper to create an unstructured acc.loop with multiple blocks and ops:
+  /// entry --cond_br--> then / else --br--> exit (acc.yield).
+  /// Preserves the builder's insertion point.
+  acc::LoopOp createUnstructuredLoopOp(ValueRange lbs, ValueRange ubs,
+                                       ValueRange steps) {
+    OpBuilder::InsertionGuard guard(b);
+
+    auto loopOp = acc::LoopOp::create(b, loc, lbs, ubs, steps,
+                                      acc::LoopParMode::loop_independent);
+    loopOp.setInclusiveUpperboundAttr(
+        b.getDenseBoolArrayAttr(SmallVector<bool>(lbs.size(), true)));
+    loopOp.setUnstructuredAttr(b.getUnitAttr());
+
+    // Create 4 blocks with control flow to test proper replication
+    Region &region = loopOp.getRegion();
+    Block *entry = b.createBlock(&region, region.begin());
+    Block *thenBlock = b.createBlock(&region, region.end());
+    Block *elseBlock = b.createBlock(&region, region.end());
+    Block *exitBlock = b.createBlock(&region, region.end());
+
+    // Entry block: create a condition and conditional branch
+    b.setInsertionPointToEnd(entry);
+    Value cond =
+        arith::ConstantOp::create(b, loc, b.getI1Type(), b.getBoolAttr(true));
+    cf::CondBranchOp::create(b, loc, cond, thenBlock, elseBlock);
+
+    // Then block: create an arith op and branch to exit
+    b.setInsertionPointToEnd(thenBlock);
+    Value c1 =
+        arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+    Value c2 =
+        arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(2));
+    arith::AddIOp::create(b, loc, c1, c2);
+    cf::BranchOp::create(b, loc, exitBlock);
+
+    // Else block: create a different arith op and branch to exit
+    b.setInsertionPointToEnd(elseBlock);
+    Value c3 =
+        arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(3));
+    Value c4 =
+        arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(4));
+    arith::MulIOp::create(b, loc, c3, c4);
+    cf::BranchOp::create(b, loc, exitBlock);
+
+    // Exit block: yield
+    b.setInsertionPointToEnd(exitBlock);
+    acc::YieldOp::create(b, loc);
+
+    return loopOp;
+  }
+
+  /// Create a module with an argument-less function and set the insertion
+  /// point at the start of its entry block.
+  std::pair<OwningOpRef<ModuleOp>, func::FuncOp> createModuleWithFunc() {
+    return createModuleWithFuncArgs(TypeRange{});
+  }
+
+  /// Create a module with a function taking `argTypes` (and returning
+  /// nothing), and set the insertion point at the start of its entry block.
+  std::pair<OwningOpRef<ModuleOp>, func::FuncOp>
+  createModuleWithFuncArgs(TypeRange argTypes) {
+    OwningOpRef<ModuleOp> module = ModuleOp::create(loc);
+    b.setInsertionPointToStart(module->getBody());
+
+    auto funcType = b.getFunctionType(argTypes, {});
+    auto funcOp = func::FuncOp::create(b, loc, "test_func", funcType);
+    Block *entryBlock = funcOp.addEntryBlock();
+    b.setInsertionPointToStart(entryBlock);
+
+    return {std::move(module), funcOp};
+  }
+
+  /// Helper to extract a constant integer/index value from a Value. Returns
+  /// std::nullopt when the value is not defined by an arith constant holding
+  /// an integer attribute.
+  std::optional<int64_t> getConstantIndex(Value v) {
+    if (auto constOp = v.getDefiningOp<arith::ConstantIndexOp>())
+      return constOp.value();
+    if (auto constOp = v.getDefiningOp<arith::ConstantOp>()) {
+      if (auto intAttr = dyn_cast<IntegerAttr>(constOp.getValue()))
+        return intAttr.getInt();
+    }
+    return std::nullopt;
+  }
+
+  MLIRContext context;
+  OpBuilder b;
+  Location loc;
+};
+
+//===----------------------------------------------------------------------===//
+// convertACCLoopToSCFFor Tests
+//===----------------------------------------------------------------------===//
+
+// A one-dimensional, index-typed acc.loop over [0, 10] (inclusive) becomes an
+// scf.for over [0, 11) with unit step.
+TEST_F(OpenACCUtilsLoopTest, ConvertSimpleLoopToSCFFor) {
+  auto [module, funcOp] = createModuleWithFunc();
+
+  Value lower = createIndexConstant(0);
+  Value upper = createIndexConstant(10);
+  Value stride = createIndexConstant(1);
+  acc::LoopOp accLoop = createLoopOp({lower}, {upper}, {stride});
+
+  scf::ForOp scfLoop =
+      convertACCLoopToSCFFor(accLoop, /*enableCollapse=*/false);
+  ASSERT_TRUE(scfLoop);
+
+  // The induction variable is index-typed.
+  EXPECT_TRUE(scfLoop.getInductionVar().getType().isIndex());
+
+  // Lower bound stays 0.
+  std::optional<int64_t> lowerValue =
+      getConstantIndex(scfLoop.getLowerBound());
+  ASSERT_TRUE(lowerValue.has_value());
+  EXPECT_EQ(*lowerValue, 0);
+
+  // Inclusive 10 is folded into exclusive 11.
+  std::optional<int64_t> upperValue =
+      getConstantIndex(scfLoop.getUpperBound());
+  ASSERT_TRUE(upperValue.has_value());
+  EXPECT_EQ(*upperValue, 11);
+
+  // Step stays 1.
+  std::optional<int64_t> strideValue = getConstantIndex(scfLoop.getStep());
+  ASSERT_TRUE(strideValue.has_value());
+  EXPECT_EQ(*strideValue, 1);
+
+  // The converted body ends with scf.yield rather than an ACC terminator.
+  Operation *bodyTerminator = scfLoop.getBody()->getTerminator();
+  EXPECT_TRUE(isa<scf::YieldOp>(bodyTerminator));
+}
+
+// i32 loop operands must show up as index-typed bounds on the scf.for.
+TEST_F(OpenACCUtilsLoopTest, ConvertLoopWithI32Bounds) {
+  auto [module, funcOp] = createModuleWithFunc();
+
+  Value lowerI32 = createI32Constant(0);
+  Value upperI32 = createI32Constant(100);
+  Value strideI32 = createI32Constant(1);
+  acc::LoopOp accLoop = createLoopOp({lowerI32}, {upperI32}, {strideI32});
+
+  scf::ForOp scfLoop =
+      convertACCLoopToSCFFor(accLoop, /*enableCollapse=*/false);
+  ASSERT_TRUE(scfLoop);
+
+  // The induction variable and all three loop operands are index-typed.
+  EXPECT_TRUE(scfLoop.getInductionVar().getType().isIndex());
+  EXPECT_TRUE(scfLoop.getLowerBound().getType().isIndex());
+  EXPECT_TRUE(scfLoop.getUpperBound().getType().isIndex());
+  EXPECT_TRUE(scfLoop.getStep().getType().isIndex());
+
+  // The converted body ends with scf.yield.
+  Operation *bodyTerminator = scfLoop.getBody()->getTerminator();
+  EXPECT_TRUE(isa<scf::YieldOp>(bodyTerminator));
+}
+
+// Bounds coming from function arguments cannot be folded: the lower bound is
+// used directly and the inclusive upper bound turns into `ub + 1`.
+TEST_F(OpenACCUtilsLoopTest, ConvertLoopWithNonConstantBounds) {
+  Type indexTy = b.getIndexType();
+  auto [module, funcOp] = createModuleWithFuncArgs({indexTy, indexTy});
+  Block &funcEntry = funcOp.getBody().front();
+
+  Value lowerArg = funcEntry.getArgument(0);
+  Value upperArg = funcEntry.getArgument(1);
+  Value stride = createIndexConstant(1);
+  acc::LoopOp accLoop = createLoopOp({lowerArg}, {upperArg}, {stride});
+
+  scf::ForOp scfLoop =
+      convertACCLoopToSCFFor(accLoop, /*enableCollapse=*/false);
+  ASSERT_TRUE(scfLoop);
+
+  // An index-typed lower bound needs no cast and is reused as-is.
+  EXPECT_EQ(scfLoop.getLowerBound(), lowerArg);
+
+  // The upper bound is an arith.addi of the argument and the constant 1.
+  auto upperPlusOne = scfLoop.getUpperBound().getDefiningOp<arith::AddIOp>();
+  ASSERT_TRUE(upperPlusOne);
+  EXPECT_EQ(upperPlusOne.getLhs(), upperArg);
+  std::optional<int64_t> addend = getConstantIndex(upperPlusOne.getRhs());
+  ASSERT_TRUE(addend.has_value());
+  EXPECT_EQ(*addend, 1);
+
+  // The step is the very constant that was handed to the acc.loop.
+  EXPECT_EQ(scfLoop.getStep(), stride);
+}
+
+// With collapsing enabled, a two-dimensional acc.loop yields a single,
+// normalized scf.for with no loop nested inside it.
+TEST_F(OpenACCUtilsLoopTest, ConvertLoopToSCFForWithCollapse) {
+  auto [module, funcOp] = createModuleWithFunc();
+
+  Value zero = createIndexConstant(0);
+  Value ten = createIndexConstant(10);
+  Value one = createIndexConstant(1);
+  acc::LoopOp accLoop = createLoopOp({zero, zero}, {ten, ten}, {one, one});
+
+  scf::ForOp scfLoop = convertACCLoopToSCFFor(accLoop, /*enableCollapse=*/true);
+  ASSERT_TRUE(scfLoop);
+
+  // No scf.for may remain inside the collapsed loop.
+  bool sawInnerFor = false;
+  scfLoop.getBody()->walk([&](scf::ForOp) { sawInnerFor = true; });
+  EXPECT_FALSE(sawInnerFor);
+
+  // Collapsing normalizes the loop: it starts at 0 ...
+  std::optional<int64_t> lowerValue =
+      getConstantIndex(scfLoop.getLowerBound());
+  ASSERT_TRUE(lowerValue.has_value());
+  EXPECT_EQ(*lowerValue, 0);
+
+  // ... and advances by 1.
+  std::optional<int64_t> strideValue = getConstantIndex(scfLoop.getStep());
+  ASSERT_TRUE(strideValue.has_value());
+  EXPECT_EQ(*strideValue, 1);
+
+  // The upper bound is expected to be the product of the trip counts
+  // (11 * 11 = 121); only its type is checked here.
+  EXPECT_TRUE(scfLoop.getUpperBound().getType().isIndex());
+}
+
+// Without collapsing, a two-dimensional acc.loop keeps its nesting: the outer
+// scf.for contains another scf.for.
+TEST_F(OpenACCUtilsLoopTest, ConvertLoopToSCFForNoCollapse) {
+  auto [module, funcOp] = createModuleWithFunc();
+
+  Value zero = createIndexConstant(0);
+  Value ten = createIndexConstant(10);
+  Value one = createIndexConstant(1);
+  acc::LoopOp accLoop = createLoopOp({zero, zero}, {ten, ten}, {one, one});
+
+  scf::ForOp outerLoop =
+      convertACCLoopToSCFFor(accLoop, /*enableCollapse=*/false);
+  ASSERT_TRUE(outerLoop);
+
+  bool sawInnerFor = false;
+  outerLoop.getBody()->walk([&](scf::ForOp) { sawInnerFor = true; });
+  EXPECT_TRUE(sawInnerFor);
+}
+
+// An exclusive upper bound needs no adjustment, so the scf.for reuses the
+// original bound and step values unchanged.
+TEST_F(OpenACCUtilsLoopTest, ConvertLoopToSCFForExclusiveUpperBound) {
+  auto [module, funcOp] = createModuleWithFunc();
+
+  Value lower = createIndexConstant(0);
+  Value upper = createIndexConstant(10);
+  Value stride = createIndexConstant(1);
+  acc::LoopOp accLoop =
+      createLoopOp({lower}, {upper}, {stride}, /*inclusiveUpperbound=*/false);
+
+  scf::ForOp scfLoop =
+      convertACCLoopToSCFFor(accLoop, /*enableCollapse=*/false);
+  ASSERT_TRUE(scfLoop);
+
+  // Same SSA values as on the acc.loop: in particular no `+ 1` on the bound.
+  EXPECT_EQ(scfLoop.getLowerBound(), lower);
+  EXPECT_EQ(scfLoop.getUpperBound(), upper);
+  EXPECT_EQ(scfLoop.getStep(), stride);
+}
+
+//===----------------------------------------------------------------------===//
+// convertACCLoopToSCFParallel Tests
+//===----------------------------------------------------------------------===//
+
+// A one-dimensional acc.loop over [0, 10] (inclusive) becomes a normalized
+// scf.parallel: lb = 0, step = 1, ub = trip count = 11.
+TEST_F(OpenACCUtilsLoopTest, ConvertSimpleLoopToSCFParallel) {
+  auto [module, funcOp] = createModuleWithFunc();
+
+  Value lower = createIndexConstant(0);
+  Value upper = createIndexConstant(10);
+  Value stride = createIndexConstant(1);
+  acc::LoopOp accLoop = createLoopOp({lower}, {upper}, {stride});
+
+  scf::ParallelOp parLoop = convertACCLoopToSCFParallel(accLoop, b);
+  ASSERT_TRUE(parLoop);
+  EXPECT_EQ(parLoop.getNumLoops(), 1u);
+
+  // All three operands of the single dimension fold to constants.
+  std::optional<int64_t> lowerValue =
+      getConstantIndex(parLoop.getLowerBound()[0]);
+  std::optional<int64_t> strideValue = getConstantIndex(parLoop.getStep()[0]);
+  std::optional<int64_t> upperValue =
+      getConstantIndex(parLoop.getUpperBound()[0]);
+  ASSERT_TRUE(lowerValue.has_value());
+  ASSERT_TRUE(strideValue.has_value());
+  ASSERT_TRUE(upperValue.has_value());
+  EXPECT_EQ(*lowerValue, 0);
+  EXPECT_EQ(*strideValue, 1);
+  EXPECT_EQ(*upperValue, 11); // 0..10 inclusive with step 1
+
+  // Exactly one induction variable, of index type.
+  EXPECT_EQ(parLoop.getInductionVars().size(), 1u);
+  EXPECT_TRUE(parLoop.getInductionVars()[0].getType().isIndex());
+}
+
+// i32 loop operands (5..15 step 2) must end up as index-typed, normalized
+// operands on the scf.parallel.
+TEST_F(OpenACCUtilsLoopTest, ConvertLoopWithI32BoundsToSCFParallel) {
+  auto [module, funcOp] = createModuleWithFunc();
+
+  Value lowerI32 = createI32Constant(5);
+  Value upperI32 = createI32Constant(15);
+  Value strideI32 = createI32Constant(2);
+  acc::LoopOp accLoop = createLoopOp({lowerI32}, {upperI32}, {strideI32});
+
+  scf::ParallelOp parLoop = convertACCLoopToSCFParallel(accLoop, b);
+  ASSERT_TRUE(parLoop);
+  EXPECT_EQ(parLoop.getNumLoops(), 1u);
+
+  // Every loop operand has been converted from i32 to index.
+  EXPECT_TRUE(parLoop.getLowerBound()[0].getType().isIndex());
+  EXPECT_TRUE(parLoop.getUpperBound()[0].getType().isIndex());
+  EXPECT_TRUE(parLoop.getStep()[0].getType().isIndex());
+
+  // Normalized form: lb = 0 and step = 1. The upper bound (the trip count) is
+  // not checked for a constant value: the index_cast in front of it is
+  // expected to prevent folding.
+  std::optional<int64_t> lowerValue =
+      getConstantIndex(parLoop.getLowerBound()[0]);
+  std::optional<int64_t> strideValue = getConstantIndex(parLoop.getStep()[0]);
+  ASSERT_TRUE(lowerValue.has_value());
+  ASSERT_TRUE(strideValue.has_value());
+  EXPECT_EQ(*lowerValue, 0);
+  EXPECT_EQ(*strideValue, 1);
+
+  // The induction variable is index-typed as well.
+  EXPECT_TRUE(parLoop.getInductionVars()[0].getType().isIndex());
+}
+
+// With bounds and step taken from function arguments, the scf.parallel is
+// still normalized (lb = 0, step = 1) but its upper bound is computed at
+// runtime.
+TEST_F(OpenACCUtilsLoopTest, ConvertLoopWithNonConstantBoundsToSCFParallel) {
+  Type indexTy = b.getIndexType();
+  auto [module, funcOp] = createModuleWithFuncArgs({indexTy, indexTy, indexTy});
+  Block &funcEntry = funcOp.getBody().front();
+
+  Value lowerArg = funcEntry.getArgument(0);
+  Value upperArg = funcEntry.getArgument(1);
+  Value strideArg = funcEntry.getArgument(2);
+  acc::LoopOp accLoop = createLoopOp({lowerArg}, {upperArg}, {strideArg});
+
+  scf::ParallelOp parLoop = convertACCLoopToSCFParallel(accLoop, b);
+  ASSERT_TRUE(parLoop);
+  EXPECT_EQ(parLoop.getNumLoops(), 1u);
+
+  // Normalized lower bound and step are constants.
+  std::optional<int64_t> lowerValue =
+      getConstantIndex(parLoop.getLowerBound()[0]);
+  std::optional<int64_t> strideValue = getConstantIndex(parLoop.getStep()[0]);
+  ASSERT_TRUE(lowerValue.has_value());
+  ASSERT_TRUE(strideValue.has_value());
+  EXPECT_EQ(*lowerValue, 0);
+  EXPECT_EQ(*strideValue, 1);
+
+  // The trip count depends on the dynamic bounds, so it is not a constant.
+  std::optional<int64_t> upperValue =
+      getConstantIndex(parLoop.getUpperBound()[0]);
+  EXPECT_FALSE(upperValue.has_value());
+}
+
+TEST_F(OpenACCUtilsLoopTest, ConvertMultiDimLoopToSCFParallel) {
+ auto [module, funcOp] = createModuleWithFunc();
+
+ Value c0 = createIndexConstant(0);
+ Value c10 = createIndexConstant(10);
+ Value c1 = createIndexConstant(1);
+
+ acc::LoopOp loopOp = createLoopOp({c0, c0}, {c10, c10}, {c1, c1});
+ scf::ParallelOp parallelOp = convertACCLoopToSCFParallel(loopOp, b);
+
+ ASSERT_TRUE(parallelOp);
+ EXPECT_EQ(parallelOp.getNumLoops(), 2u);
+
+ // Both dimensions should have normalized lb=0, step=1, ub=11
+ for (unsigned i = 0; i < 2; ++i) {
+ auto lb = getConstantIndex(parallelOp.getLowerBound()[i]);
+ auto step = getConstantIndex(parallelOp.getStep()[i]);
+ auto ub = getConstantIndex(parallelOp.getUpperBound()[i]);
+
+ ASSERT_TRUE(lb.has_value());
+ ASSERT_TRUE(step.has_value());
+ ASSERT_TRUE(ub.has_value());
+
+ EXPECT_EQ(*lb, 0);
+ EXPECT_EQ(*step, 1);
+ EXPECT_EQ(*ub, 11); // 0..10 inclusive = 11 iterations
+ }
+
+ // Should have 2 induction variables
+ EXPECT_EQ(parallelOp.getInductionVars().size(), 2u);
+ EXPECT_TRUE(parallelOp.getInductionVars()[0].getType().isIndex());
+ EXPECT_TRUE(parallelOp.getInductionVars()[1].getType().isIndex());
+}
+
+TEST_F(OpenACCUtilsLoopTest, ConvertLoopWithLargeStepToSCFParallel) {
+ auto [module, funcOp] = createModuleWithFunc();
+
+ Value lb = createIndexConstant(0);
+ Value ub = createIndexConstant(100);
+ Value step = createIndexConstant(10);
+
+ acc::LoopOp loopOp = createLoopOp({lb}, {ub}, {step});
+ scf::ParallelOp parallelOp = convertACCLoopToSCFParallel(loopOp, b);
+
+ ASSERT_TRUE(parallelOp);
+ EXPECT_EQ(parallelOp.getNumLoops(), 1u);
+
+ // Normalized: lb=0, step=1, ub=tripCount
+ auto lbConst = getConstantIndex(parallelOp.getLowerBound()[0]);
+ auto stepConst = getConstantIndex(parallelOp.getStep()[0]);
+ auto ubConst = getConstantIndex(parallelOp.getUpperBound()[0]);
+ ASSERT_TRUE(lbConst.has_value());
+ ASSERT_TRUE(stepConst.has_value());
+ ASSERT_TRUE(ubConst.has_value());
+ EXPECT_EQ(*lbConst, 0);
+ EXPECT_EQ(*stepConst, 1);
+ EXPECT_EQ(*ubConst, 11); // trip count for 0..100 inclusive with step 10
+
+ // Verify IV is index type
+ EXPECT_TRUE(parallelOp.getInductionVars()[0].getType().isIndex());
+}
+
+//===----------------------------------------------------------------------===//
+// convertUnstructuredACCLoopToSCFExecuteRegion Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCUtilsLoopTest, ConvertUnstructuredLoopToExecuteRegion) {
+ auto [module, funcOp] = createModuleWithFunc();
+
+ Value c0 = createIndexConstant(0);
+ Value c10 = createIndexConstant(10);
+ Value c1 = createIndexConstant(1);
+
+ acc::LoopOp loopOp = createUnstructuredLoopOp({c0}, {c10}, {c1});
+
+ // Verify the source loop has 4 blocks
+ EXPECT_EQ(loopOp.getRegion().getBlocks().size(), 4u);
+
+ scf::ExecuteRegionOp exeRegionOp =
+ convertUnstructuredACCLoopToSCFExecuteRegion(loopOp, b);
+
+ ASSERT_TRUE(exeRegionOp);
+
+ // The execute_region should have 4 blocks replicated from the source
+ EXPECT_EQ(exeRegionOp.getRegion().getBlocks().size(), 4u);
+
+ // Verify that the control flow structure is preserved:
+ Block &entryBlock = exeRegionOp.getRegion().front();
+ EXPECT_TRUE(isa<cf::CondBranchOp>(entryBlock.getTerminator()));
+
+ Block &exitBlock = exeRegionOp.getRegion().back();
+ EXPECT_TRUE(isa<scf::YieldOp>(exitBlock.getTerminator()));
+
+ // Count arith operations to verify body was cloned correctly
+ unsigned addCount = 0;
+ unsigned mulCount = 0;
+ exeRegionOp.getRegion().walk([&](arith::AddIOp) { ++addCount; });
+ exeRegionOp.getRegion().walk([&](arith::MulIOp) { ++mulCount; });
+ EXPECT_EQ(addCount, 1u);
+ EXPECT_EQ(mulCount, 1u);
+}
+
+TEST_F(OpenACCUtilsLoopTest, ConvertUnstructuredLoopPreservesSuccessors) {
+ auto [module, funcOp] = createModuleWithFunc();
+
+ Value c0 = createIndexConstant(0);
+ Value c10 = createIndexConstant(10);
+ Value c1 = createIndexConstant(1);
+
+ acc::LoopOp loopOp = createUnstructuredLoopOp({c0}, {c10}, {c1});
+ scf::ExecuteRegionOp exeRegionOp =
+ convertUnstructuredACCLoopToSCFExecuteRegion(loopOp, b);
+
+ ASSERT_TRUE(exeRegionOp);
+
+ Block &entryBlock = exeRegionOp.getRegion().front();
+ auto condBranch = dyn_cast<cf::CondBranchOp>(entryBlock.getTerminator());
+ ASSERT_TRUE(condBranch);
+
+ // Both successors should exist in the region
+ Block *trueDest = condBranch.getTrueDest();
+ Block *falseDest = condBranch.getFalseDest();
+ EXPECT_TRUE(trueDest->getParent() == &exeRegionOp.getRegion());
+ EXPECT_TRUE(falseDest->getParent() == &exeRegionOp.getRegion());
+}
+
+//===----------------------------------------------------------------------===//
+// Error Case Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCUtilsLoopTest, UnstructuredLoopWithYieldOperandsReturnsNullptr) {
+ auto [module, funcOp] = createModuleWithFunc();
+
+ Value c0 = createIndexConstant(0);
+ Value c10 = createIndexConstant(10);
+ Value c1 = createIndexConstant(1);
+
+ // Create an unstructured loop where the yield has operands (simulating
+ // a loop with results, which is not yet supported)
+ auto loopOp = acc::LoopOp::create(b, loc, {c0}, {c10}, {c1},
+ acc::LoopParMode::loop_independent);
+ loopOp.setInclusiveUpperboundAttr(b.getDenseBoolArrayAttr({true}));
+ loopOp.setUnstructuredAttr(b.getUnitAttr());
+
+ // Create multi-block body with yield that has operands
+ {
+ OpBuilder::InsertionGuard guard(b);
+ Region &region = loopOp.getRegion();
+ Block *entry = b.createBlock(&region, region.begin());
+ Block *exitBlock = b.createBlock(&region, region.end());
+
+ b.setInsertionPointToEnd(entry);
+ cf::BranchOp::create(b, loc, exitBlock);
+
+ b.setInsertionPointToEnd(exitBlock);
+ // Create a yield with operands - this triggers the error
+ Value result = createI32Constant(42);
+ acc::YieldOp::create(b, loc, ValueRange{result});
+ }
+ // InsertionGuard restores insertion point to after loopOp
+
+ // Use a diagnostic handler to capture the error
+ std::string errorMsg;
+ ScopedDiagnosticHandler handler(&context, [&](Diagnostic &diag) {
+ if (diag.getSeverity() == DiagnosticSeverity::Error) {
+ llvm::raw_string_ostream os(errorMsg);
+ os << diag;
+ }
+ return success();
+ });
+
+ scf::ExecuteRegionOp exeRegionOp =
+ convertUnstructuredACCLoopToSCFExecuteRegion(loopOp, b);
+
+ // Should return nullptr due to unsupported loop with results
+ EXPECT_FALSE(exeRegionOp);
+ EXPECT_TRUE(errorMsg.find("not yet supported") != std::string::npos);
+}
>From 8bb04f1b77397be204e1a8051a728cee70dabe6b Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Fri, 19 Dec 2025 08:05:03 -0800
Subject: [PATCH 2/7] Remove zero clamping code
---
.../Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp | 16 +++-------------
1 file changed, 3 insertions(+), 13 deletions(-)
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
index fe0707320cc79..6e26f0ff20ed8 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
@@ -23,10 +23,8 @@ using namespace mlir;
namespace {
-/// Calculate trip count for a loop: max(0, (ub - lb + step) / step)
-/// If inclusiveUpperbound is true, uses ub as-is; otherwise subtracts 1.
-/// The result is clamped to 0 to handle cases where lb > ub for positive step
-/// (or lb < ub for negative step), which would result in a negative trip count.
+/// Calculate trip count for a loop: (ub - lb + step) / step
+/// If inclusiveUpperbound is false, subtracts 1 from ub first.
static Value calculateTripCount(OpBuilder &b, Location loc, Value lb, Value ub,
Value step, bool inclusiveUpperbound) {
Type type = b.getIndexType();
@@ -43,15 +41,7 @@ static Value calculateTripCount(OpBuilder &b, Location loc, Value lb, Value ub,
Value sub = b.createOrFold<arith::SubIOp>(loc, ub, lb);
Value add = b.createOrFold<arith::AddIOp>(loc, sub, step);
- Value unclampedTrips = b.createOrFold<arith::DivSIOp>(loc, add, step);
-
- // Clamp negative trip counts to 0
- Value zero = arith::ConstantIndexOp::create(b, loc, 0);
- Value isNegative = b.createOrFold<arith::CmpIOp>(
- loc, arith::CmpIPredicate::slt, unclampedTrips, zero);
- Value trips =
- b.createOrFold<arith::SelectOp>(loc, isNegative, zero, unclampedTrips);
- return trips;
+ return b.createOrFold<arith::DivSIOp>(loc, add, step);
}
/// Get exclusive upper bound from acc.loop (add 1 if inclusive).
>From 511ed21a8e6d37cb952dc802813217ba41a1d691 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Fri, 19 Dec 2025 08:08:10 -0800
Subject: [PATCH 3/7] Spell out insertionPoint in lieu of ip
---
mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
index 6e26f0ff20ed8..069e3a27237a2 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
@@ -112,14 +112,14 @@ static Block::iterator cloneACCRegionInto(Region *src, Block *dest,
auto lastNewBlock = std::prev(postInsertBlock->getIterator());
- Block::iterator ip;
+ Block::iterator newInsertionPoint;
Operation *terminator = lastNewBlock->getTerminator();
if (auto yieldOp = dyn_cast<acc::YieldOp>(terminator)) {
- ip = std::prev(yieldOp->getIterator());
+ newInsertionPoint = std::prev(yieldOp->getIterator());
yieldOp.erase();
} else if (auto terminatorOp = dyn_cast<acc::TerminatorOp>(terminator)) {
- ip = std::prev(terminatorOp->getIterator());
+ newInsertionPoint = std::prev(terminatorOp->getIterator());
terminatorOp.erase();
} else {
llvm_unreachable("unexpected terminator in ACC region");
@@ -135,7 +135,7 @@ static Block::iterator cloneACCRegionInto(Region *src, Block *dest,
dest->getOperations().splice(dest->end(), firstNewBlock->getOperations());
firstNewBlock->erase();
- return ip;
+ return newInsertionPoint;
}
/// Wrap a multi-block region with scf.execute_region.
>From 1ee1af3b2dd54973b12f639d1fb746e46f46441f Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Fri, 19 Dec 2025 08:10:07 -0800
Subject: [PATCH 4/7] Avoid magic constant 4 in SmallVector declarations
---
mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
index 069e3a27237a2..ce8e43de7324a 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
@@ -200,7 +200,7 @@ scf::ForOp convertACCLoopToSCFFor(LoopOp loopOp, bool enableCollapse) {
// Create nested scf.for loops and build IR mapping for IVs
IRMapping mapping;
- SmallVector<scf::ForOp, 4> forOps;
+ SmallVector<scf::ForOp> forOps;
b.setInsertionPoint(loopOp);
OpBuilder nestBuilder(loopOp);
@@ -219,7 +219,7 @@ scf::ForOp convertACCLoopToSCFFor(LoopOp loopOp, bool enableCollapse) {
}
// Handle IV type conversion (index -> original type)
- SmallVector<Value, 4> scfIVs;
+ SmallVector<Value> scfIVs;
for (scf::ForOp forOp : forOps)
scfIVs.push_back(forOp.getInductionVar());
mapACCLoopIVsToSCFIVs(loopOp, scfIVs, nestBuilder, mapping);
@@ -246,7 +246,7 @@ scf::ParallelOp convertACCLoopToSCFParallel(LoopOp loopOp, OpBuilder &b) {
Location loc = loopOp->getLoc();
- SmallVector<Value, 4> lowerBounds, upperBounds, steps;
+ SmallVector<Value> lowerBounds, upperBounds, steps;
// Normalize all loops: lb=0, step=1, ub=tripCount
Value lb = arith::ConstantIndexOp::create(b, loc, 0);
>From 9bd93f6db4e5257d24148cf26ac2c5b3a027f442 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Fri, 19 Dec 2025 08:17:20 -0800
Subject: [PATCH 5/7] Use ConstantIndex and ConstantInt creates
---
mlir/unittests/Dialect/OpenACC/OpenACCUtilsLoopTest.cpp | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsLoopTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsLoopTest.cpp
index e23ff2049ca37..d88ccf87c7916 100644
--- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsLoopTest.cpp
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsLoopTest.cpp
@@ -40,14 +40,12 @@ class OpenACCUtilsLoopTest : public ::testing::Test {
/// Helper to create an index constant
Value createIndexConstant(int64_t value) {
- return arith::ConstantOp::create(b, loc, b.getIndexType(),
- b.getIndexAttr(value));
+ return arith::ConstantIndexOp::create(b, loc, value);
}
/// Helper to create an i32 constant
Value createI32Constant(int32_t value) {
- return arith::ConstantOp::create(b, loc, b.getI32Type(),
- b.getI32IntegerAttr(value));
+ return arith::ConstantIntOp::create(b, loc, b.getI32Type(), value);
}
/// Helper to create a simple acc.loop with the given bounds.
>From 1a8d1658106132923acb8b5403a869e187d25436 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Mon, 22 Dec 2025 09:53:35 -0800
Subject: [PATCH 6/7] Compute exclusive upperbound as index directly
---
.../OpenACC/Utils/OpenACCUtilsLoop.cpp | 25 +++++++++----------
1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
index ce8e43de7324a..7e90a878f2b5b 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
@@ -45,24 +45,24 @@ static Value calculateTripCount(OpBuilder &b, Location loc, Value lb, Value ub,
}
/// Get exclusive upper bound from acc.loop (add 1 if inclusive).
-static Value getExclusiveUpperBound(acc::LoopOp loopOp, size_t ivPos,
- OpBuilder &b) {
+/// The result is always in index type.
+static Value getExclusiveUpperBoundAsIndex(acc::LoopOp loopOp, size_t ivPos,
+ OpBuilder &b) {
bool isInclusive = false;
if (loopOp.getInclusiveUpperbound().has_value())
isInclusive = loopOp.getInclusiveUpperboundAttr().asArrayRef()[ivPos];
Value origUB = loopOp.getUpperbound()[ivPos];
+ Location loc = origUB.getLoc();
+ Type indexType = b.getIndexType();
+
+ // Cast to index first, then add if inclusive
+ Value ub = getValueOrCreateCastToIndexLike(b, loc, indexType, origUB);
if (isInclusive) {
- Location loc = origUB.getLoc();
- Value one;
- Type ubType = origUB.getType();
- if (ubType.isIndex())
- one = arith::ConstantIndexOp::create(b, loc, 1);
- else
- one = arith::ConstantIntOp::create(b, loc, ubType, 1);
- return b.createOrFold<arith::AddIOp>(loc, origUB, one);
+ Value one = arith::ConstantIndexOp::create(b, loc, 1);
+ ub = b.createOrFold<arith::AddIOp>(loc, ub, one);
}
- return origUB;
+ return ub;
}
/// Handle differing types between SCF (index) and ACC loops.
@@ -189,8 +189,7 @@ scf::ForOp convertACCLoopToSCFFor(LoopOp loopOp, bool enableCollapse) {
Value newLowerBound = getValueOrCreateCastToIndexLike(
b, loc, indexType, accLoopOp.getLowerbound()[idx]);
- Value newUpperBound = getValueOrCreateCastToIndexLike(
- b, loc, indexType, getExclusiveUpperBound(accLoopOp, idx, b));
+ Value newUpperBound = getExclusiveUpperBoundAsIndex(accLoopOp, idx, b);
Value newStep = getValueOrCreateCastToIndexLike(b, loc, indexType,
accLoopOp.getStep()[idx]);
>From dc8bcb495d1b725fdf8519008dff7e7c70bd8780 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Mon, 22 Dec 2025 09:59:17 -0800
Subject: [PATCH 7/7] Add nsw flag
---
.../Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
index 7e90a878f2b5b..c0eeb4cfc4d2c 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsLoop.cpp
@@ -36,11 +36,14 @@ static Value calculateTripCount(OpBuilder &b, Location loc, Value lb, Value ub,
if (!inclusiveUpperbound) {
Value one = arith::ConstantIndexOp::create(b, loc, 1);
- ub = b.createOrFold<arith::SubIOp>(loc, ub, one);
+ ub = b.createOrFold<arith::SubIOp>(loc, ub, one,
+ arith::IntegerOverflowFlags::nsw);
}
- Value sub = b.createOrFold<arith::SubIOp>(loc, ub, lb);
- Value add = b.createOrFold<arith::AddIOp>(loc, sub, step);
+ Value sub = b.createOrFold<arith::SubIOp>(loc, ub, lb,
+ arith::IntegerOverflowFlags::nsw);
+ Value add = b.createOrFold<arith::AddIOp>(loc, sub, step,
+ arith::IntegerOverflowFlags::nsw);
return b.createOrFold<arith::DivSIOp>(loc, add, step);
}
@@ -60,7 +63,8 @@ static Value getExclusiveUpperBoundAsIndex(acc::LoopOp loopOp, size_t ivPos,
Value ub = getValueOrCreateCastToIndexLike(b, loc, indexType, origUB);
if (isInclusive) {
Value one = arith::ConstantIndexOp::create(b, loc, 1);
- ub = b.createOrFold<arith::AddIOp>(loc, ub, one);
+ ub = b.createOrFold<arith::AddIOp>(loc, ub, one,
+ arith::IntegerOverflowFlags::nsw);
}
return ub;
}
@@ -88,8 +92,10 @@ static void normalizeIVUses(OpBuilder &b, Location loc, Value iv, Value origLB,
Value step = getValueOrCreateCastToIndexLike(b, loc, indexType, origStep);
// new_iv * step + lb
- Value scaled = arith::MulIOp::create(b, loc, iv, step);
- Value denormalized = arith::AddIOp::create(b, loc, scaled, lb);
+ Value scaled =
+ arith::MulIOp::create(b, loc, iv, step, arith::IntegerOverflowFlags::nsw);
+ Value denormalized = arith::AddIOp::create(b, loc, scaled, lb,
+ arith::IntegerOverflowFlags::nsw);
// Replace uses of iv with denormalized value, except for the ops that
// compute the denormalized value itself (muli and addi)
More information about the Mlir-commits
mailing list