[Mlir-commits] [mlir] [mlir][acc] Add ACCComputeLowering pass (PR #185501)
Razvan Lupusoru
llvmlistbot at llvm.org
Tue Mar 10 10:46:55 PDT 2026
https://github.com/razvanlupusoru updated https://github.com/llvm/llvm-project/pull/185501
>From 7f575dfc69243f3543d3075caa0a66453cf9d8bf Mon Sep 17 00:00:00 2001
From: Scott Manley <rscottmanley at gmail.com>
Date: Mon, 9 Mar 2026 12:41:25 -0700
Subject: [PATCH 1/3] [mlir][acc] Add ACCComputeLowering pass
Introduce a pass that lowers OpenACC compute constructs to a
representation that separates the data environment from the
compute body and prepares for parallelism assignment and
privatization at the right granularity.
- Decompose acc.parallel, acc.serial, and acc.kernels into
acc.kernel_environment and acc.compute_region. Launch arguments
(num_gangs, num_workers, vector_length) are turned into
acc.par_width and passed as compute_region launch operands.
- Convert acc.loop to SCF based on context: unstructured loops to
scf.execute_region; sequential (serial or seq) to scf.parallel
with par_dims=sequential; auto loops to scf.for (with collapse
when multi-dimensional); orphan loops to scf.for; independent
loops in parallel/kernels to scf.parallel with par_dims from the
GPU mapping.
---
.../mlir/Dialect/OpenACC/OpenACCCGOps.td | 11 +
.../mlir/Dialect/OpenACC/OpenACCUtils.h | 5 +-
.../mlir/Dialect/OpenACC/OpenACCUtilsCG.h | 24 +-
.../mlir/Dialect/OpenACC/OpenACCUtilsLoop.h | 16 +-
.../mlir/Dialect/OpenACC/Transforms/Passes.td | 71 ++--
mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp | 28 +-
.../OpenACC/Transforms/ACCComputeLowering.cpp | 361 ++++++++++++++++++
.../Dialect/OpenACC/Transforms/CMakeLists.txt | 3 +
.../Dialect/OpenACC/Utils/OpenACCUtils.cpp | 3 +-
.../Dialect/OpenACC/Utils/OpenACCUtilsCG.cpp | 53 +++
.../OpenACC/acc-compute-lowering-compute.mlir | 107 ++++++
.../OpenACC/acc-compute-lowering-loop.mlir | 130 +++++++
.../acc-compute-lowering-unstructured.mlir | 34 ++
.../Dialect/OpenACC/OpenACCUtilsCGTest.cpp | 77 +++-
14 files changed, 879 insertions(+), 44 deletions(-)
create mode 100644 mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
create mode 100644 mlir/test/Dialect/OpenACC/acc-compute-lowering-compute.mlir
create mode 100644 mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
create mode 100644 mlir/test/Dialect/OpenACC/acc-compute-lowering-unstructured.mlir
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
index ebb0e6132fee3..f6ae871eb9936 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
@@ -181,6 +181,17 @@ def OpenACC_KernelEnvironmentOp
}];
let hasCanonicalizer = 1;
+
+ let extraClassDeclaration = [{
+ /// Create a `KernelEnvironmentOp` populated with data mapping, async, and
+ /// wait clauses extracted from the given ACC compute construct. Emplaces
+ /// a block in the region and sets the builder's insertion point to the
+ /// start of that block so callers can create operations inside it
+ /// (e.g., `acc.compute_region`).
+ template <typename ComputeConstructT>
+ static KernelEnvironmentOp createAndPopulate(
+ ComputeConstructT computeConstruct, ::mlir::OpBuilder &builder);
+ }];
}
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
index aac3bf7ed67c8..dd3d34b8252d3 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
@@ -10,10 +10,13 @@
#define MLIR_DIALECT_OPENACC_OPENACCUTILS_H_
#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Remarks.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
-#include <functional>
+#include "llvm/ADT/Twine.h"
+#include <optional>
+#include <string>
namespace mlir {
class DominanceInfo;
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
index 7bead720b1077..b5fdcca761a4a 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
@@ -14,12 +14,12 @@
#ifndef MLIR_DIALECT_OPENACC_OPENACCUTILSCG_H_
#define MLIR_DIALECT_OPENACC_OPENACCUTILSCG_H_
+#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/Interfaces/DataLayoutInterfaces.h"
+#include "mlir/IR/IRMapping.h"
#include <optional>
namespace mlir {
-class Operation;
-
namespace acc {
/// Get the data layout for an operation.
@@ -34,6 +34,26 @@ namespace acc {
std::optional<DataLayout> getDataLayout(Operation *op,
bool allowDefault = true);
+/// Build an `acc.compute_region` operation by cloning a source region.
+///
+/// Creates a new `acc.compute_region` with the given launch arguments and
+/// origin string, then clones the operations from `regionToClone` into its
+/// body. Multi-block regions are wrapped with `scf.execute_region`.
+///
+/// The `mapping` is used and updated during cloning, allowing callers to
+/// track value correspondences. Optional `output`, `kernelFuncName`,
+/// `kernelModuleName`, and `stream` arguments are forwarded to the op.
+ComputeRegionOp buildComputeRegion(Location loc, ValueRange launchArgs,
+ ValueRange inputArgs,
+ llvm::StringRef origin,
+ Region &regionToClone,
+ RewriterBase &rewriter,
+ IRMapping &mapping,
+ ValueRange output = {},
+ FlatSymbolRefAttr kernelFuncName = {},
+ FlatSymbolRefAttr kernelModuleName = {},
+ Value stream = {});
+
} // namespace acc
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h
index 67ae2d1ede04e..e0428bec620ca 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h
@@ -13,22 +13,12 @@
#ifndef MLIR_DIALECT_OPENACC_OPENACCUTILSLOOP_H_
#define MLIR_DIALECT_OPENACC_OPENACCUTILSLOOP_H_
-#include "mlir/IR/Block.h"
-#include "mlir/IR/ValueRange.h"
-#include "llvm/ADT/SmallVector.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/IRMapping.h"
namespace mlir {
-class IRMapping;
-class Location;
-class Region;
-class RewriterBase;
-namespace scf {
-class ForOp;
-class ParallelOp;
-class ExecuteRegionOp;
-} // namespace scf
namespace acc {
-class LoopOp;
/// Clone an ACC region into a destination block at the given insertion point.
/// Requires a single-block source region. Maps block arguments and optional
diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
index 8e00846255254..9ab99208f83c7 100644
--- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
@@ -11,6 +11,26 @@
include "mlir/Pass/PassBase.td"
+//===----------------------------------------------------------------------===//
+// Common options shared by multiple ACC passes
+//===----------------------------------------------------------------------===//
+
+def AccDeviceTypeOption : Option<"deviceType", "device-type",
+ "mlir::acc::DeviceType", "mlir::acc::DeviceType::None",
+ "Target device type. One use case is ensuring that device_type-specific "
+ "clauses are considered. Another is device-specific specializations.",
+ [{::llvm::cl::values(
+ clEnumValN(mlir::acc::DeviceType::None, "none", "none"),
+ clEnumValN(mlir::acc::DeviceType::Host, "host", "host"),
+ clEnumValN(mlir::acc::DeviceType::Multicore, "multicore", "multicore"),
+ clEnumValN(mlir::acc::DeviceType::Nvidia, "nvidia", "nvidia"),
+ clEnumValN(mlir::acc::DeviceType::Radeon, "radeon", "radeon"))
+ }]>;
+
+//===----------------------------------------------------------------------===//
+// Pass definitions
+//===----------------------------------------------------------------------===//
+
def LegalizeDataValuesInRegion : Pass<"openacc-legalize-data-values", "mlir::func::FuncOp"> {
let summary = "Legalizes SSA values in compute regions with results from data clause operations";
let description = [{
@@ -120,20 +140,7 @@ def ACCImplicitRoutine : Pass<"acc-implicit-routine", "mlir::ModuleOp"> {
while avoiding infinite recursion through proper tracking.
}];
let dependentDialects = ["mlir::acc::OpenACCDialect"];
- let options = [
- Option<"deviceType", "device-type", "mlir::acc::DeviceType",
- "mlir::acc::DeviceType::None",
- "Target device type for implicit routine generation. "
- "Ensures that `acc routine` device_type clauses are "
- "properly considered not just default clauses.",
- [{::llvm::cl::values(
- clEnumValN(mlir::acc::DeviceType::None, "none", "none"),
- clEnumValN(mlir::acc::DeviceType::Host, "host", "host"),
- clEnumValN(mlir::acc::DeviceType::Multicore, "multicore", "multicore"),
- clEnumValN(mlir::acc::DeviceType::Nvidia, "nvidia", "nvidia"),
- clEnumValN(mlir::acc::DeviceType::Radeon, "radeon", "radeon"))
- }]>
- ];
+ let options = [ AccDeviceTypeOption ];
}
def ACCDeclareGPUModuleInsertion : Pass<"acc-declare-gpu-module-insertion", "mlir::ModuleOp"> {
@@ -412,21 +419,35 @@ def OffloadTargetVerifier : Pass<"offload-target-verifier", "mlir::func::FuncOp"
}];
let dependentDialects = ["mlir::acc::OpenACCDialect"];
let options = [
- Option<"deviceType", "device-type", "mlir::acc::DeviceType",
- "mlir::acc::DeviceType::None",
- "Target device type for verification. Host/multicore uses host "
- "region checking, all others use device region checking.",
- [{::llvm::cl::values(
- clEnumValN(mlir::acc::DeviceType::None, "none", "none"),
- clEnumValN(mlir::acc::DeviceType::Host, "host", "host"),
- clEnumValN(mlir::acc::DeviceType::Multicore, "multicore", "multicore"),
- clEnumValN(mlir::acc::DeviceType::Nvidia, "nvidia", "nvidia"),
- clEnumValN(mlir::acc::DeviceType::Radeon, "radeon", "radeon"))
- }]>,
+ AccDeviceTypeOption,
Option<"softCheck", "soft-check", "bool", "false",
"When true, illegal values are printed via LLVM_DEBUG instead of "
"failing compilation. Useful for diagnostic purposes.">
];
}
+def ACCComputeLowering : Pass<"acc-compute-lowering", "mlir::func::FuncOp"> {
+ let summary = "Lower ACC compute constructs to acc.compute_region";
+ let description = [{
+ Converts ACC frontend compute constructs (`acc.parallel`, `acc.kernels`,
+ `acc.serial`) to `acc.compute_region` wrapped in `acc.kernel_environment`.
+ Converts `acc.loop` to SCF parallel/for loops with parallel dimension
+ annotations.
+
+ The pass applies two phases of pattern rewrites:
+ 1. Loop conversion: `acc.loop` is converted to `scf.parallel` or `scf.for`
+ while the parent compute construct is still present (needed to determine
+ loop conversion strategy).
+ 2. Compute construct conversion: `acc.parallel`, `acc.kernels`, and
+ `acc.serial` are replaced by `acc.kernel_environment` containing
+ `acc.compute_region`.
+ }];
+ let dependentDialects = [
+ "mlir::acc::OpenACCDialect",
+ "mlir::arith::ArithDialect",
+ "mlir::scf::SCFDialect"
+ ];
+ let options = [ AccDeviceTypeOption ];
+}
+
#endif // MLIR_DIALECT_OPENACC_TRANSFORMS_PASSES
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
index e77a955ef3045..ba677082ba4e2 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -218,6 +218,31 @@ void KernelEnvironmentOp::getCanonicalizationPatterns(
results.add<RemoveEmptyKernelEnvironment>(context);
}
+template <typename ComputeConstructT>
+KernelEnvironmentOp KernelEnvironmentOp::createAndPopulate(
+ ComputeConstructT computeConstruct, OpBuilder &builder) {
+ auto kernelEnvironment = KernelEnvironmentOp::create(
+ builder, computeConstruct->getLoc(),
+ computeConstruct.getDataClauseOperands(),
+ computeConstruct.getAsyncOperands(),
+ computeConstruct.getAsyncOperandsDeviceTypeAttr(),
+ computeConstruct.getAsyncOnlyAttr(), computeConstruct.getWaitOperands(),
+ computeConstruct.getWaitOperandsSegmentsAttr(),
+ computeConstruct.getWaitOperandsDeviceTypeAttr(),
+ computeConstruct.getHasWaitDevnumAttr(),
+ computeConstruct.getWaitOnlyAttr());
+ Block &block = kernelEnvironment.getRegion().emplaceBlock();
+ builder.setInsertionPointToStart(&block);
+ return kernelEnvironment;
+}
+
+template KernelEnvironmentOp
+KernelEnvironmentOp::createAndPopulate<ParallelOp>(ParallelOp, OpBuilder &);
+template KernelEnvironmentOp
+KernelEnvironmentOp::createAndPopulate<KernelsOp>(KernelsOp, OpBuilder &);
+template KernelEnvironmentOp
+KernelEnvironmentOp::createAndPopulate<SerialOp>(SerialOp, OpBuilder &);
+
//===----------------------------------------------------------------------===//
// FirstprivateMapInitialOp
//===----------------------------------------------------------------------===//
@@ -417,7 +442,8 @@ SmallVector<GPUParallelDimAttr> ComputeRegionOp::getLaunchParDims() {
Value ComputeRegionOp::getOperand(BlockArgument blockArg) {
unsigned argNumber = blockArg.getArgNumber();
unsigned numLaunchArgs = getLaunchArgs().size();
- assert(argNumber < (numLaunchArgs + getInputArgs().size()) &&
+ unsigned numInputArgs = getInputArgs().size();
+ assert(argNumber < (numLaunchArgs + numInputArgs) &&
"invalid block argument");
if (argNumber < numLaunchArgs)
return getLaunchArgs()[argNumber];
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp b/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
new file mode 100644
index 0000000000000..8bb5dc9eb43d7
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
@@ -0,0 +1,361 @@
+//===- ACCComputeLowering.cpp - Lower ACC compute to compute_region -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass decomposes OpenACC compute constructs into a representation that
+// separates the data environment from the compute portion and prepares for
+// parallelism assignment and privatization at the appropriate level.
+//
+// Overview:
+// ---------
+// Each compute construct (`acc.parallel`, `acc.serial`, `acc.kernels`) is
+// lowered to (1) `acc.kernel_environment`, which captures the data environment
+// and (2) `acc.compute_region`, which holds the compute body. Inside the
+// compute region, acc.loop is converted to SCF loops (`scf.parallel` or
+// `scf.for`) with any predetermined parallelism expressed as `par_dims`. This
+// decomposition allows later phases to assign parallelism and handle
+// privatization at the right granularity.
+//
+// Transformations:
+// ----------------
+// 1. Compute constructs: acc.parallel, acc.serial, and acc.kernels are
+// replaced by acc.kernel_environment containing a single acc.compute_region.
+// Launch arguments (num_gangs, num_workers, vector_length) become
+// acc.par_width ops and are passed as compute_region launch operands.
+//
+// 2. acc.loop: Converted according to context and attributes:
+// - Unstructured: body wrapped in scf.execute_region.
+// - Sequential (serial region or seq clause): scf.parallel with
+// par_dims = sequential.
+// - Auto (in parallel/kernels): scf.for with collapse when
+// multi-dimensional.
+// - Orphan (not inside a compute construct): scf.for, no collapse.
+// - Independent (in parallel/kernels): scf.parallel with par_dims from
+// gang/worker/vector mapping (e.g. block_x).
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/OpenACC/OpenACCParMapping.h"
+#include "mlir/Dialect/OpenACC/OpenACCUtils.h"
+#include "mlir/Dialect/OpenACC/OpenACCUtilsCG.h"
+#include "mlir/Dialect/OpenACC/OpenACCUtilsLoop.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+namespace mlir {
+namespace acc {
+#define GEN_PASS_DEF_ACCCOMPUTELOWERING
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
+} // namespace acc
+} // namespace mlir
+
+#define DEBUG_TYPE "acc-compute-lowering"
+
+using namespace mlir;
+using namespace mlir::acc;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+
+/// Strip index_cast operations from a value before checking for a constant.
+static Value stripIndexCasts(Value val) {
+ while (auto castOp = val.getDefiningOp<arith::IndexCastOp>())
+ val = castOp.getIn();
+ return val;
+}
+
+/// A parallel construct is "effectively serial" when it specifies
+/// num_gangs(1), num_workers(1), and vector_length(1). This matches
+/// the semantics of acc.serial but expressed through acc.parallel.
+static bool isEffectivelySerial(ParallelOp op) {
+ auto numGangs = op.getNumGangsValues();
+ if (numGangs.size() != 1)
+ return false;
+ Value numWorkers = op.getNumWorkersValue();
+ if (!numWorkers)
+ return false;
+ Value vectorLength = op.getVectorLengthValue();
+ if (!vectorLength)
+ return false;
+ return isConstantIntValue(stripIndexCasts(numGangs.front()), 1) &&
+ isConstantIntValue(stripIndexCasts(numWorkers), 1) &&
+ isConstantIntValue(stripIndexCasts(vectorLength), 1);
+}
+
+static bool isOpInComputeRegion(Operation *op) {
+ Region *region = op->getBlock()->getParent();
+ return getEnclosingComputeOp(*region) != nullptr;
+}
+
+static bool isOpInSerialRegion(Operation *op) {
+ if (auto parallelOp = op->getParentOfType<ParallelOp>())
+ return isEffectivelySerial(parallelOp);
+ if (auto computeRegion = op->getParentOfType<ComputeRegionOp>())
+ return computeRegion.isEffectivelySerial();
+ if (op->getParentOfType<SerialOp>())
+ return true;
+ return false;
+}
+
+static void setParDimsAttr(Operation *op, GPUParallelDimsAttr attr) {
+ op->setAttr(GPUParallelDimsAttr::name, attr);
+}
+
+/// Insert a parallel dimension into the list, maintaining order by
+/// GPUParallelDimAttr::getOrder (descending).
+static void insertParDim(SmallVectorImpl<GPUParallelDimAttr> &parDims,
+ GPUParallelDimAttr parDim) {
+ GPUParallelDimAttr *lb = llvm::lower_bound(
+ parDims, parDim,
+ [](const GPUParallelDimAttr &a, const GPUParallelDimAttr &b) {
+ return a.getOrder() > b.getOrder();
+ });
+ if (lb == parDims.end() || *lb != parDim)
+ parDims.insert(lb, parDim);
+}
+
+/// Map loop parallelism clauses (gang/worker/vector) to GPU parallel
+/// dimensions using the given mapping policy.
+static SmallVector<GPUParallelDimAttr>
+getParallelDimensions(LoopOp loopOp, const ACCToGPUMappingPolicy &policy,
+ DeviceType deviceType) {
+ SmallVector<GPUParallelDimAttr> parDims;
+ auto *ctx = loopOp->getContext();
+
+ if (loopOp.hasVector(deviceType))
+ insertParDim(parDims, policy.vectorDim(ctx));
+ if (loopOp.hasWorker(deviceType))
+ insertParDim(parDims, policy.workerDim(ctx));
+ if (auto gangDimValue = loopOp.getGangValue(GangArgType::Dim, deviceType)) {
+ if (auto gangDimDefOp =
+ gangDimValue.getDefiningOp<arith::ConstantIntOp>()) {
+ auto gangLevel = getGangParLevel(gangDimDefOp.value());
+ insertParDim(parDims, policy.gangDim(ctx, gangLevel));
+ }
+ } else if (loopOp.hasGang(deviceType)) {
+ insertParDim(parDims, policy.gangDim(ctx, ParLevel::gang_dim1));
+ }
+ return parDims;
+}
+
+/// Create acc.par_width operations from gang/worker/vector values of a
+/// compute construct. Queries the device-type-specific values first, falling
+/// back to the default (DeviceType::None) values.
+template <typename ComputeConstructT>
+static SmallVector<Value>
+assignKnownLaunchArgs(ComputeConstructT computeOp, DeviceType deviceType,
+ RewriterBase &rewriter,
+ const ACCToGPUMappingPolicy &policy) {
+ SmallVector<Value> values;
+ auto *ctx = rewriter.getContext();
+ auto indexTy = rewriter.getIndexType();
+ auto loc = computeOp->getLoc();
+
+ auto numGangs = computeOp.getNumGangsValues(deviceType);
+ if (numGangs.empty())
+ numGangs = computeOp.getNumGangsValues();
+ for (auto [gangDimIdx, gangSize] : llvm::enumerate(numGangs)) {
+ auto gangLevel = getGangParLevel(gangDimIdx + 1);
+ values.push_back(
+ ParWidthOp::create(rewriter, loc,
+ getValueOrCreateCastToIndexLike(
+ rewriter, gangSize.getLoc(), indexTy, gangSize),
+ policy.gangDim(ctx, gangLevel)));
+ }
+
+ Value numWorkers = computeOp.getNumWorkersValue(deviceType);
+ if (!numWorkers)
+ numWorkers = computeOp.getNumWorkersValue();
+ if (numWorkers) {
+ values.push_back(ParWidthOp::create(
+ rewriter, loc,
+ getValueOrCreateCastToIndexLike(rewriter, numWorkers.getLoc(), indexTy,
+ numWorkers),
+ policy.workerDim(ctx)));
+ }
+
+ Value vectorLength = computeOp.getVectorLengthValue(deviceType);
+ if (!vectorLength)
+ vectorLength = computeOp.getVectorLengthValue();
+ if (vectorLength) {
+ values.push_back(ParWidthOp::create(
+ rewriter, loc,
+ getValueOrCreateCastToIndexLike(rewriter, vectorLength.getLoc(),
+ indexTy, vectorLength),
+ policy.vectorDim(ctx)));
+ }
+ return values;
+}
+
+/// SerialOp has no gang/worker/vector clauses.
+template <>
+SmallVector<Value>
+assignKnownLaunchArgs<SerialOp>(SerialOp, DeviceType, RewriterBase &,
+ const ACCToGPUMappingPolicy &) {
+ return {};
+}
+
+//===----------------------------------------------------------------------===//
+// Loop conversion pattern
+//===----------------------------------------------------------------------===//
+
+class ACCLoopConversion : public OpRewritePattern<LoopOp> {
+public:
+ ACCLoopConversion(MLIRContext *ctx, const ACCToGPUMappingPolicy &policy,
+ DeviceType deviceType)
+ : OpRewritePattern<LoopOp>(ctx), policy(policy), deviceType(deviceType) {}
+
+ LogicalResult matchAndRewrite(LoopOp loopOp,
+ PatternRewriter &rewriter) const override {
+ if (loopOp.getUnstructured()) {
+ auto executeRegion =
+ convertUnstructuredACCLoopToSCFExecuteRegion(loopOp, rewriter);
+ if (!executeRegion)
+ return failure();
+ rewriter.replaceOp(loopOp, executeRegion);
+ return success();
+ }
+
+ LoopParMode parMode = loopOp.getDefaultOrDeviceTypeParallelism(deviceType);
+
+ if (parMode == LoopParMode::loop_seq || isOpInSerialRegion(loopOp)) {
+ // Although it might seem unintuitive, scf.parallel is used here because
+ // the parallelism of the loop is already predetermined (as sequential).
+ // scf.for will become a candidate for auto-parallelization analysis.
+ auto parallelOp = convertACCLoopToSCFParallel(loopOp, rewriter);
+ if (!parallelOp)
+ return failure();
+ setParDimsAttr(parallelOp,
+ GPUParallelDimsAttr::seq(loopOp->getContext()));
+ rewriter.replaceOp(loopOp, parallelOp);
+ } else if (parMode == LoopParMode::loop_auto) {
+ // All loops in serial regions should have already been handled.
+ assert(!isOpInSerialRegion(loopOp) &&
+ "Expected loop to be in non-serial region");
+ // Mark as scf.for to allow auto-parallelization analysis later.
+ auto forOp =
+ convertACCLoopToSCFFor(loopOp, rewriter, /*enableCollapse=*/true);
+ if (!forOp)
+ return failure();
+ rewriter.replaceOp(loopOp, forOp);
+ } else if (!isOpInComputeRegion(loopOp)) {
+ // This loop is an orphan `acc loop` but it is not in any sort
+ // of compute region. Thus it is just a sequential non-accelerator loop.
+ auto forOp =
+ convertACCLoopToSCFFor(loopOp, rewriter, /*enableCollapse=*/false);
+ if (!forOp)
+ return failure();
+ rewriter.replaceOp(loopOp, forOp);
+ } else {
+ assert(parMode == LoopParMode::loop_independent &&
+ "Expected loop to be independent");
+ auto parallelOp = convertACCLoopToSCFParallel(loopOp, rewriter);
+ if (!parallelOp)
+ return failure();
+
+ SmallVector<GPUParallelDimAttr> parDims =
+ getParallelDimensions(loopOp, policy, deviceType);
+ if (!parDims.empty()) {
+ auto parDimsAttr =
+ GPUParallelDimsAttr::get(loopOp->getContext(), parDims);
+ setParDimsAttr(parallelOp, parDimsAttr);
+ }
+
+ rewriter.replaceOp(loopOp, parallelOp);
+ }
+ return success();
+ }
+
+private:
+ const ACCToGPUMappingPolicy &policy;
+ DeviceType deviceType;
+};
+
+//===----------------------------------------------------------------------===//
+// Compute construct conversion pattern
+//===----------------------------------------------------------------------===//
+
+template <typename ComputeConstructT>
+class ComputeOpConversion : public OpRewritePattern<ComputeConstructT> {
+public:
+ ComputeOpConversion(MLIRContext *ctx, const ACCToGPUMappingPolicy &policy,
+ DeviceType deviceType)
+ : OpRewritePattern<ComputeConstructT>(ctx), policy(policy),
+ deviceType(deviceType) {}
+
+ LogicalResult matchAndRewrite(ComputeConstructT computeOp,
+ PatternRewriter &rewriter) const override {
+ rewriter.setInsertionPoint(computeOp);
+ auto kernelEnv =
+ KernelEnvironmentOp::createAndPopulate(computeOp, rewriter);
+ auto launchArgs =
+ assignKnownLaunchArgs(computeOp, deviceType, rewriter, policy);
+ Region &region = computeOp.getRegion();
+ SetVector<Value> liveInValues;
+ getUsedValuesDefinedAbove(region, region, liveInValues);
+ IRMapping mapping;
+ auto computeRegion = buildComputeRegion(
+ computeOp->getLoc(), launchArgs, liveInValues.getArrayRef(),
+ ComputeConstructT::getOperationName(), region, rewriter, mapping);
+ if (!computeRegion) {
+ rewriter.eraseOp(kernelEnv);
+ return failure();
+ }
+ rewriter.eraseOp(computeOp);
+ return success();
+ }
+
+private:
+ const ACCToGPUMappingPolicy &policy;
+ DeviceType deviceType;
+};
+
+//===----------------------------------------------------------------------===//
+// Pass implementation
+//===----------------------------------------------------------------------===//
+
+class ACCComputeLowering
+ : public acc::impl::ACCComputeLoweringBase<ACCComputeLowering> {
+public:
+ using ACCComputeLoweringBase::ACCComputeLoweringBase;
+
+ void runOnOperation() override {
+ auto op = getOperation();
+ auto *context = op.getContext();
+
+ DefaultACCToGPUMappingPolicy policy;
+
+ // Part 1: Convert acc.loop to scf.parallel/scf.for while the parent
+ // compute construct is still present (needed to determine conversion
+ // strategy).
+ RewritePatternSet loopPatterns(context);
+ loopPatterns.insert<ACCLoopConversion>(context, policy, deviceType);
+ if (failed(applyPatternsGreedily(op, std::move(loopPatterns))))
+ return signalPassFailure();
+
+ // Part 2: Convert acc.parallel, acc.kernels, and acc.serial to
+ // acc.kernel_environment { acc.compute_region { ... } }.
+ RewritePatternSet computePatterns(context);
+ computePatterns
+ .insert<ComputeOpConversion<ParallelOp>, ComputeOpConversion<KernelsOp>,
+ ComputeOpConversion<SerialOp>>(context, policy, deviceType);
+ if (failed(applyPatternsGreedily(op, std::move(computePatterns))))
+ return signalPassFailure();
+ }
+};
+
+} // namespace
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
index 3d85fd805ace1..1bb16b4b9642d 100644
--- a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
@@ -1,4 +1,5 @@
add_mlir_dialect_library(MLIROpenACCTransforms
+ ACCComputeLowering.cpp
ACCDeclareGPUModuleInsertion.cpp
ACCIfClauseLowering.cpp
ACCImplicitData.cpp
@@ -27,6 +28,8 @@ add_mlir_dialect_library(MLIROpenACCTransforms
LINK_LIBS PUBLIC
MLIRAnalysis
+ MLIRArithDialect
+ MLIRArithUtils
MLIROpenACCAnalysis
MLIROpenACCDialect
MLIROpenACCUtils
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
index 911f256a3d2a6..1c63760a6984b 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
@@ -21,7 +21,8 @@
#include "llvm/Support/Casting.h"
mlir::Operation *mlir::acc::getEnclosingComputeOp(mlir::Region &region) {
- return region.getParentOfType<ACC_COMPUTE_CONSTRUCT_OPS>();
+ return region
+ .getParentOfType<ACC_COMPUTE_CONSTRUCT_OPS, mlir::acc::ComputeRegionOp>();
}
template <typename OpTy>
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsCG.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsCG.cpp
index 5c5c453f2cae0..f5e0e5c33fee4 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsCG.cpp
@@ -11,7 +11,10 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/OpenACC/OpenACCUtilsCG.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/OpenACC/OpenACCUtilsLoop.h"
#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/IRMapping.h"
namespace mlir {
namespace acc {
@@ -51,5 +54,55 @@ std::optional<DataLayout> getDataLayout(Operation *op, bool allowDefault) {
return std::nullopt;
}
+ComputeRegionOp
+buildComputeRegion(Location loc, ValueRange launchArgs, ValueRange inputArgs,
+ llvm::StringRef origin, Region &regionToClone,
+ RewriterBase &rewriter, IRMapping &mapping,
+ ValueRange output, FlatSymbolRefAttr kernelFuncName,
+ FlatSymbolRefAttr kernelModuleName, Value stream) {
+ SmallVector<Type> resultTypes;
+ for (auto val : output)
+ resultTypes.push_back(val.getType());
+ auto computeRegion =
+ ComputeRegionOp::create(rewriter, loc, resultTypes, launchArgs, inputArgs,
+ stream, origin, kernelFuncName, kernelModuleName);
+
+ assert(!regionToClone.getBlocks().empty() &&
+ "empty region for acc.compute_region");
+ OpBuilder::InsertionGuard guard(rewriter);
+
+ auto parWidthType = ParWidthType::get(rewriter.getContext());
+ Block *entryBlock = rewriter.createBlock(&computeRegion.getRegion());
+ for (size_t i = 0; i < launchArgs.size(); ++i)
+ entryBlock->addArgument(parWidthType, loc);
+ for (Value input : inputArgs)
+ entryBlock->addArgument(input.getType(), loc);
+ for (size_t i = 0; i < inputArgs.size(); ++i)
+ mapping.map(inputArgs[i], entryBlock->getArgument(launchArgs.size() + i));
+ rewriter.setInsertionPointToStart(entryBlock);
+ if (regionToClone.getBlocks().size() == 1) {
+ for (auto &op : regionToClone.front().getOperations()) {
+ if (op.hasTrait<OpTrait::IsTerminator>())
+ break;
+ rewriter.clone(op, mapping);
+ }
+ } else {
+ auto exeRegion = mlir::acc::wrapMultiBlockRegionWithSCFExecuteRegion(
+ regionToClone, mapping, loc, rewriter);
+ if (!exeRegion) {
+ rewriter.eraseOp(computeRegion);
+ return nullptr;
+ }
+ }
+
+ SmallVector<Value> yieldOperands;
+ for (auto val : output)
+ yieldOperands.push_back(mapping.lookup(val));
+ rewriter.setInsertionPointToEnd(entryBlock);
+ YieldOp::create(rewriter, loc, yieldOperands);
+
+ return computeRegion;
+}
+
} // namespace acc
} // namespace mlir
diff --git a/mlir/test/Dialect/OpenACC/acc-compute-lowering-compute.mlir b/mlir/test/Dialect/OpenACC/acc-compute-lowering-compute.mlir
new file mode 100644
index 0000000000000..77c4ba94c4f18
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/acc-compute-lowering-compute.mlir
@@ -0,0 +1,107 @@
+// RUN: mlir-opt %s -acc-compute-lowering | FileCheck %s
+
+// CHECK-LABEL: func.func @parallel_gang_loop
+func.func @parallel_gang_loop(%buf: memref<1xi32>) {
+ %c0 = arith.constant 0 : index
+ %c1_i32 = arith.constant 1 : i32
+ %c10_i32 = arith.constant 10 : i32
+ %c100_i32 = arith.constant 100 : i32
+
+ %dev = acc.copyin varPtr(%buf : memref<1xi32>) -> memref<1xi32>
+ // CHECK-NOT: acc.parallel
+ // CHECK: acc.kernel_environment
+ // CHECK: acc.par_width {{.*}} {par_dim = #acc.par_dim<block_x>}
+ // CHECK: acc.compute_region launch(
+ // CHECK: scf.parallel
+ // CHECK: acc.par_dims = #acc<par_dims[block_x]>
+ acc.parallel num_gangs({%c10_i32 : i32}) dataOperands(%dev : memref<1xi32>) {
+ acc.loop gang control(%arg0 : i32) = (%c1_i32 : i32) to (%c100_i32 : i32) step (%c1_i32 : i32) {
+ memref.store %arg0, %dev[%c0] : memref<1xi32>
+ acc.yield
+ } attributes {independent = [#acc.device_type<none>]}
+ acc.yield
+ }
+ acc.copyout accPtr(%dev : memref<1xi32>) to varPtr(%buf : memref<1xi32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @parallel_seq_loop
+func.func @parallel_seq_loop(%buf: memref<4xi32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+ %c10_i32 = arith.constant 10 : i32
+
+ %dev = acc.copyin varPtr(%buf : memref<4xi32>) -> memref<4xi32>
+ // CHECK-NOT: acc.parallel
+ // CHECK: acc.kernel_environment
+ // CHECK: acc.par_width {{.*}} {par_dim = #acc.par_dim<block_x>}
+ // CHECK: acc.compute_region launch(
+ // CHECK: scf.parallel
+ // CHECK: acc.par_dims = #acc<par_dims[sequential]>
+ acc.parallel num_gangs({%c10_i32 : i32}) dataOperands(%dev : memref<4xi32>) {
+ acc.loop control(%i : index) = (%c0 : index) to (%c4 : index) step (%c1 : index) {
+ %vi = arith.index_cast %i : index to i32
+ memref.store %vi, %dev[%i] : memref<4xi32>
+ acc.yield
+ } attributes {seq = [#acc.device_type<none>]}
+ acc.yield
+ }
+ acc.copyout accPtr(%dev : memref<4xi32>) to varPtr(%buf : memref<4xi32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @serial_loop
+func.func @serial_loop(%buf: memref<4xi32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+
+ %dev = acc.copyin varPtr(%buf : memref<4xi32>) -> memref<4xi32>
+ // CHECK-NOT: acc.serial
+ // CHECK: acc.kernel_environment
+ // CHECK-NOT: acc.par_width
+ // CHECK: acc.compute_region
+ // CHECK: scf.parallel
+ // CHECK: acc.par_dims = #acc<par_dims[sequential]>
+ acc.serial dataOperands(%dev : memref<4xi32>) {
+ acc.loop control(%i : index) = (%c0 : index) to (%c4 : index) step (%c1 : index) {
+ %vi = arith.index_cast %i : index to i32
+ memref.store %vi, %dev[%i] : memref<4xi32>
+ acc.yield
+ } attributes {independent = [#acc.device_type<none>]}
+ acc.yield
+ }
+ acc.copyout accPtr(%dev : memref<4xi32>) to varPtr(%buf : memref<4xi32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @kernels_loop
+func.func @kernels_loop(%buf: memref<8xi32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c8 = arith.constant 8 : index
+
+ %dev = acc.copyin varPtr(%buf : memref<8xi32>) -> memref<8xi32>
+ // CHECK-NOT: acc.kernels
+ // CHECK: acc.kernel_environment
+ // CHECK-NOT: acc.par_width
+ // CHECK: acc.compute_region
+ // CHECK: scf.parallel
+ acc.kernels dataOperands(%dev : memref<8xi32>) {
+ acc.loop control(%i : index) = (%c0 : index) to (%c8 : index) step (%c1 : index) {
+ %vi = arith.index_cast %i : index to i32
+ memref.store %vi, %dev[%i] : memref<8xi32>
+ acc.yield
+ } attributes {independent = [#acc.device_type<none>]}
+ acc.terminator
+ }
+ acc.copyout accPtr(%dev : memref<8xi32>) to varPtr(%buf : memref<8xi32>)
+ return
+}
diff --git a/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir b/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
new file mode 100644
index 0000000000000..4a5ea390233e2
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
@@ -0,0 +1,130 @@
+// RUN: mlir-opt %s -acc-compute-lowering | FileCheck %s
+
+// CHECK-LABEL: func.func @parallel_independent_loop
+func.func @parallel_independent_loop(%buf: memref<16xi32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c16 = arith.constant 16 : index
+
+ %dev = acc.copyin varPtr(%buf : memref<16xi32>) -> memref<16xi32>
+ // CHECK-NOT: acc.parallel
+ // CHECK: acc.kernel_environment
+ // CHECK-NOT: acc.par_width
+ // CHECK: acc.compute_region
+ // CHECK: scf.parallel
+ acc.parallel dataOperands(%dev : memref<16xi32>) {
+ acc.loop control(%i : index) = (%c0 : index) to (%c16 : index) step (%c1 : index) {
+ %vi = arith.index_cast %i : index to i32
+ memref.store %vi, %dev[%i] : memref<16xi32>
+ acc.yield
+ } attributes {independent = [#acc.device_type<none>]}
+ acc.yield
+ }
+ acc.copyout accPtr(%dev : memref<16xi32>) to varPtr(%buf : memref<16xi32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @parallel_loop_multi_block_body
+func.func @parallel_loop_multi_block_body(%buf: memref<4xi32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c4 = arith.constant 4 : index
+
+ %dev = acc.copyin varPtr(%buf : memref<4xi32>) -> memref<4xi32>
+ // CHECK-NOT: acc.parallel
+ // CHECK: acc.kernel_environment
+ // CHECK-NOT: acc.par_width
+ // CHECK: acc.compute_region
+ // CHECK: scf.parallel
+ // CHECK: scf.execute_region
+ acc.parallel dataOperands(%dev : memref<4xi32>) {
+ acc.loop control(%i : index) = (%c0 : index) to (%c4 : index) step (%c1 : index) {
+ %vi = arith.index_cast %i : index to i32
+ memref.store %vi, %dev[%i] : memref<4xi32>
+ cf.br ^bb1
+ ^bb1:
+ acc.yield
+ } attributes {independent = [#acc.device_type<none>]}
+ acc.yield
+ }
+ acc.copyout accPtr(%dev : memref<4xi32>) to varPtr(%buf : memref<4xi32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @parallel_loop_auto_collapse
+func.func @parallel_loop_auto_collapse(%buf: memref<1xi32>, %lb0 : index, %ub0 : index, %lb1 : index, %ub1 : index) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+
+ %dev = acc.copyin varPtr(%buf : memref<1xi32>) -> memref<1xi32>
+ // CHECK-NOT: acc.parallel
+ // CHECK: acc.kernel_environment
+ // CHECK-NOT: acc.par_width
+ // CHECK: acc.compute_region
+ // CHECK: scf.for
+ // CHECK-NOT: scf.for
+ // CHECK-NOT: scf.parallel
+ acc.parallel dataOperands(%dev : memref<1xi32>) {
+ acc.loop control(%i : index, %j : index) = (%lb0, %lb1 : index, index) to (%ub0, %ub1 : index, index) step (%c1, %c1 : index, index) {
+ %vi = arith.index_cast %i : index to i32
+ memref.store %vi, %dev[%c0] : memref<1xi32>
+ acc.yield
+ } attributes {auto_ = [#acc.device_type<none>]}
+ acc.yield
+ }
+ acc.copyout accPtr(%dev : memref<1xi32>) to varPtr(%buf : memref<1xi32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @serial_loop_normalized
+func.func @serial_loop_normalized(%buf: memref<1xi32>) {
+ %c0 = arith.constant 0 : index
+ %c2 = arith.constant 2 : index
+ %c5 = arith.constant 5 : index
+ %c9 = arith.constant 9 : index
+
+ %dev = acc.copyin varPtr(%buf : memref<1xi32>) -> memref<1xi32>
+ // CHECK-NOT: acc.serial
+ // CHECK: acc.kernel_environment
+ // CHECK-NOT: acc.par_width
+ // CHECK: acc.compute_region
+ // CHECK: scf.parallel
+ // CHECK-DAG: arith.muli
+ // CHECK-DAG: arith.addi
+ // CHECK: acc.par_dims = #acc<par_dims[sequential]>
+ acc.serial dataOperands(%dev : memref<1xi32>) {
+ acc.loop control(%i : index) = (%c5 : index) to (%c9 : index) step (%c2 : index) {
+ %vi = arith.index_cast %i : index to i32
+ memref.store %vi, %dev[%c0] : memref<1xi32>
+ acc.yield
+ } attributes {independent = [#acc.device_type<none>]}
+ acc.yield
+ }
+ acc.copyout accPtr(%dev : memref<1xi32>) to varPtr(%buf : memref<1xi32>)
+ return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @orphan_loop
+func.func @orphan_loop(%buf: memref<8xi32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c8 = arith.constant 8 : index
+ %c0_i32 = arith.constant 0 : i32
+
+ // CHECK-NOT: acc.loop
+ // CHECK: scf.for
+ // CHECK-NOT: scf.parallel
+ acc.loop control(%i : index) = (%c0 : index) to (%c8 : index) step (%c1 : index) {
+ memref.store %c0_i32, %buf[%i] : memref<8xi32>
+ acc.yield
+ } attributes {independent = [#acc.device_type<none>]}
+ return
+}
diff --git a/mlir/test/Dialect/OpenACC/acc-compute-lowering-unstructured.mlir b/mlir/test/Dialect/OpenACC/acc-compute-lowering-unstructured.mlir
new file mode 100644
index 0000000000000..f22d7872ecc32
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/acc-compute-lowering-unstructured.mlir
@@ -0,0 +1,34 @@
+// RUN: mlir-opt %s -acc-compute-lowering | FileCheck %s
+
+// CHECK-LABEL: func.func @parallel_unstructured_loop
+func.func @parallel_unstructured_loop(%buf: memref<10xi32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c10 = arith.constant 10 : index
+ %c1_i32 = arith.constant 1 : i32
+
+ %dev = acc.copyin varPtr(%buf : memref<10xi32>) -> memref<10xi32>
+ // CHECK-NOT: acc.loop
+ // CHECK: acc.kernel_environment
+ // CHECK-NOT: acc.par_width
+ // CHECK: acc.compute_region
+ // CHECK: scf.execute_region
+ acc.parallel dataOperands(%dev : memref<10xi32>) {
+ acc.loop {
+ ^entry:
+ cf.br ^header(%c0 : index)
+ ^header(%iv: index):
+ %cond = arith.cmpi ult, %iv, %c10 : index
+ cf.cond_br %cond, ^body, ^exit
+ ^body:
+ memref.store %c1_i32, %dev[%iv] : memref<10xi32>
+ %iv_next = arith.addi %iv, %c1 : index
+ cf.br ^header(%iv_next : index)
+ ^exit:
+ acc.yield
+ } attributes {independent = [#acc.device_type<none>], unstructured}
+ acc.yield
+ }
+ acc.copyout accPtr(%dev : memref<10xi32>) to varPtr(%buf : memref<10xi32>)
+ return
+}
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
index b2d5409f495f5..d56054a0b5877 100644
--- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
@@ -7,9 +7,14 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/OpenACC/OpenACCUtilsCG.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/DLTI/DLTI.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/IRMapping.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OwningOpRef.h"
#include "gtest/gtest.h"
@@ -24,7 +29,9 @@ using namespace mlir::acc;
class OpenACCUtilsCGTest : public ::testing::Test {
protected:
OpenACCUtilsCGTest() : b(&context), loc(UnknownLoc::get(&context)) {
- context.loadDialect<acc::OpenACCDialect, DLTIDialect>();
+ context.loadDialect<acc::OpenACCDialect, arith::ArithDialect,
+ func::FuncDialect, scf::SCFDialect, gpu::GPUDialect,
+ DLTIDialect>();
}
MLIRContext context;
@@ -74,3 +81,71 @@ TEST_F(OpenACCUtilsCGTest, getDataLayoutWithSpec) {
auto dl2 = getDataLayout(module->getOperation(), /*allowDefault=*/true);
EXPECT_TRUE(dl2.has_value());
}
+
+//===----------------------------------------------------------------------===//
+// buildComputeRegion Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCUtilsCGTest, buildComputeRegionEmpty) {
+ OwningOpRef<ModuleOp> module = ModuleOp::create(b, loc);
+ IRRewriter rewriter(&context);
+ rewriter.setInsertionPointToEnd(module->getBody());
+
+ auto funcTy = b.getFunctionType({}, {});
+ auto func = func::FuncOp::create(rewriter, loc, "test", funcTy);
+ Block *entry = func.addEntryBlock();
+ rewriter.setInsertionPointToStart(entry);
+
+ Region sourceRegion;
+ Block *srcBlock = new Block();
+ sourceRegion.push_back(srcBlock);
+ OpBuilder srcBuilder(&context);
+ srcBuilder.setInsertionPointToStart(srcBlock);
+ YieldOp::create(srcBuilder, loc);
+
+ IRMapping mapping;
+ auto cr =
+ buildComputeRegion(loc, /*launchArgs=*/{}, /*inputArgs=*/{},
+ SerialOp::getOperationName(), sourceRegion, rewriter,
+ mapping);
+
+ EXPECT_EQ(cr.getOrigin(), SerialOp::getOperationName());
+ EXPECT_EQ(cr.getLaunchArgs().size(), 0u);
+ EXPECT_EQ(cr.getInputArgs().size(), 0u);
+ EXPECT_TRUE(cr.getRegion().hasOneBlock());
+
+ func::ReturnOp::create(rewriter, loc);
+}
+
+TEST_F(OpenACCUtilsCGTest, buildComputeRegionWithLaunchArgs) {
+ OwningOpRef<ModuleOp> module = ModuleOp::create(b, loc);
+ IRRewriter rewriter(&context);
+ rewriter.setInsertionPointToEnd(module->getBody());
+
+ auto funcTy = b.getFunctionType({}, {});
+ auto func = func::FuncOp::create(rewriter, loc, "test", funcTy);
+ Block *entry = func.addEntryBlock();
+ rewriter.setInsertionPointToStart(entry);
+
+ auto c128 = arith::ConstantIndexOp::create(rewriter, loc, 128);
+ auto threadXDim = GPUParallelDimAttr::threadXDim(&context);
+ auto pw = ParWidthOp::create(rewriter, loc, c128, threadXDim);
+
+ Region sourceRegion;
+ Block *srcBlock = new Block();
+ sourceRegion.push_back(srcBlock);
+ OpBuilder srcBuilder(&context);
+ srcBuilder.setInsertionPointToStart(srcBlock);
+ YieldOp::create(srcBuilder, loc);
+
+ IRMapping mapping;
+ auto cr = buildComputeRegion(loc, {pw}, /*inputArgs=*/{},
+ ParallelOp::getOperationName(), sourceRegion,
+ rewriter, mapping);
+
+ EXPECT_EQ(cr.getOrigin(), ParallelOp::getOperationName());
+ EXPECT_EQ(cr.getLaunchArgs().size(), 1u);
+ EXPECT_EQ(cr.getLaunchArgs()[0], pw.getResult());
+
+ func::ReturnOp::create(rewriter, loc);
+}
>From 28a10ff67b56cffb0e10b24eb31aa0916cdefb50 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Mon, 9 Mar 2026 12:55:59 -0700
Subject: [PATCH 2/3] Fix format
---
mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h | 2 +-
mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp | 5 +++--
mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp | 11 +++++------
3 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
index b5fdcca761a4a..ba453b11492f3 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
@@ -15,8 +15,8 @@
#define MLIR_DIALECT_OPENACC_OPENACCUTILSCG_H_
#include "mlir/Dialect/OpenACC/OpenACC.h"
-#include "mlir/Interfaces/DataLayoutInterfaces.h"
#include "mlir/IR/IRMapping.h"
+#include "mlir/Interfaces/DataLayoutInterfaces.h"
#include <optional>
namespace mlir {
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
index ba677082ba4e2..d57c65694da61 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -219,8 +219,9 @@ void KernelEnvironmentOp::getCanonicalizationPatterns(
}
template <typename ComputeConstructT>
-KernelEnvironmentOp KernelEnvironmentOp::createAndPopulate(
- ComputeConstructT computeConstruct, OpBuilder &builder) {
+KernelEnvironmentOp
+KernelEnvironmentOp::createAndPopulate(ComputeConstructT computeConstruct,
+ OpBuilder &builder) {
auto kernelEnvironment = KernelEnvironmentOp::create(
builder, computeConstruct->getLoc(),
computeConstruct.getDataClauseOperands(),
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
index d56054a0b5877..671fa6c5560eb 100644
--- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
@@ -104,10 +104,9 @@ TEST_F(OpenACCUtilsCGTest, buildComputeRegionEmpty) {
YieldOp::create(srcBuilder, loc);
IRMapping mapping;
- auto cr =
- buildComputeRegion(loc, /*launchArgs=*/{}, /*inputArgs=*/{},
- SerialOp::getOperationName(), sourceRegion, rewriter,
- mapping);
+ auto cr = buildComputeRegion(loc, /*launchArgs=*/{}, /*inputArgs=*/{},
+ SerialOp::getOperationName(), sourceRegion,
+ rewriter, mapping);
EXPECT_EQ(cr.getOrigin(), SerialOp::getOperationName());
EXPECT_EQ(cr.getLaunchArgs().size(), 0u);
@@ -140,8 +139,8 @@ TEST_F(OpenACCUtilsCGTest, buildComputeRegionWithLaunchArgs) {
IRMapping mapping;
auto cr = buildComputeRegion(loc, {pw}, /*inputArgs=*/{},
- ParallelOp::getOperationName(), sourceRegion,
- rewriter, mapping);
+ ParallelOp::getOperationName(), sourceRegion,
+ rewriter, mapping);
EXPECT_EQ(cr.getOrigin(), ParallelOp::getOperationName());
EXPECT_EQ(cr.getLaunchArgs().size(), 1u);
>From b3b0985ea6e86c9041e151380d0730ecbb666829 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Tue, 10 Mar 2026 10:46:42 -0700
Subject: [PATCH 3/3] Convert to scf.parallel in specialized routine
---
mlir/include/mlir/Dialect/OpenACC/OpenACC.h | 4 +--
.../OpenACC/Transforms/ACCComputeLowering.cpp | 5 +++-
.../OpenACC/acc-compute-lowering-loop.mlir | 25 +++++++++++++++++++
3 files changed, 31 insertions(+), 3 deletions(-)
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
index fe5b42807236f..55fc8251a9bbd 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
@@ -189,13 +189,13 @@ static constexpr StringLiteral getSpecializedRoutineAttrName() {
/// Used to check whether the current operation is marked with
/// `acc routine`. The operation passed in should be a function.
inline bool isAccRoutine(mlir::Operation *op) {
- return op->hasAttr(mlir::acc::getRoutineInfoAttrName());
+ return op && op->hasAttr(mlir::acc::getRoutineInfoAttrName());
}
/// Used to check whether this is a specialized accelerator version of
/// `acc routine` function.
inline bool isSpecializedAccRoutine(mlir::Operation *op) {
- return op->hasAttr(mlir::acc::getSpecializedRoutineAttrName());
+ return op && op->hasAttr(mlir::acc::getSpecializedRoutineAttrName());
}
static constexpr StringLiteral getFromDefaultClauseAttrName() {
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp b/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
index 8bb5dc9eb43d7..60787dd9c0be5 100644
--- a/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
+++ b/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
@@ -50,6 +50,7 @@
#include "mlir/Dialect/OpenACC/OpenACCUtilsLoop.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/IRMapping.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/RegionUtils.h"
@@ -252,7 +253,9 @@ class ACCLoopConversion : public OpRewritePattern<LoopOp> {
if (!forOp)
return failure();
rewriter.replaceOp(loopOp, forOp);
- } else if (!isOpInComputeRegion(loopOp)) {
+ } else if (!isOpInComputeRegion(loopOp) &&
+ !isSpecializedAccRoutine(
+ loopOp->getParentOfType<FunctionOpInterface>())) {
// This loop is an orphan `acc loop` but it is not in any sort
// of compute region. Thus it is just a sequential non-accelerator loop.
auto forOp =
diff --git a/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir b/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
index 4a5ea390233e2..69f9e748a92d7 100644
--- a/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
+++ b/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
@@ -128,3 +128,28 @@ func.func @orphan_loop(%buf: memref<8xi32>) {
} attributes {independent = [#acc.device_type<none>]}
return
}
+
+// -----
+
+// Loop in specialized acc routine: should not be treated as orphan (scf.for)
+// but converted to scf.parallel when independent. With vector tag, the
+// scf.parallel gets acc.par_dims = thread_x (vector dimension).
+acc.routine @routine_with_loop func(@device_routine_with_loop) seq
+// CHECK-LABEL: func.func @device_routine_with_loop
+// CHECK: attributes {acc.specialized_routine = #acc.specialized_routine<@routine_with_loop, <seq>, "host_routine_with_loop">}
+// CHECK-NOT: acc.loop
+// CHECK: scf.parallel
+// CHECK: acc.par_dims = #acc<par_dims[thread_x]>
+// CHECK-NOT: scf.for
+func.func @device_routine_with_loop(%buf: memref<8xi32>) attributes {acc.specialized_routine = #acc.specialized_routine<@routine_with_loop, <seq>, "host_routine_with_loop">} {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c8 = arith.constant 8 : index
+ %c0_i32 = arith.constant 0 : i32
+
+ acc.loop control(%i : index) = (%c0 : index) to (%c8 : index) step (%c1 : index) {
+ memref.store %c0_i32, %buf[%i] : memref<8xi32>
+ acc.yield
+ } attributes {independent = [#acc.device_type<none>], vector = [#acc.device_type<none>]}
+ return
+}
More information about the Mlir-commits
mailing list