[Mlir-commits] [mlir] [mlir][acc] Add acc.compute_region and acc.par_width operations (PR #184864)
Razvan Lupusoru
llvmlistbot at llvm.org
Thu Mar 5 12:57:52 PST 2026
https://github.com/razvanlupusoru updated https://github.com/llvm/llvm-project/pull/184864
>From 77c688242fa938eb9fda86f0cefbd8d131659036 Mon Sep 17 00:00:00 2001
From: Scott Manley <rscottmanley at gmail.com>
Date: Thu, 5 Mar 2026 11:17:22 -0800
Subject: [PATCH 1/3] [mlir][acc] Add acc.compute_region and acc.par_width operations
Introduce two new codegen operations to the acc dialect that model
GPU compute region execution and parallel launch configuration:
- acc.par_width: specifies the parallel width for a given GPU parallel
  dimension.
- acc.compute_region: wraps a region of code for GPU execution, capturing
  launch configuration (from acc.par_width results) and input values as
  block arguments.
These operations bridge the gap between high-level OpenACC compute
constructs (acc.parallel, acc.kernels, acc.serial) and gpu.launch.
The passes that do these transformations will soon follow.
---
.../mlir/Dialect/OpenACC/OpenACCCGOps.td | 153 +++++++++++
.../mlir/Dialect/OpenACC/OpenACCOps.td | 2 +-
mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp | 243 ++++++++++++++++++
mlir/test/Dialect/OpenACC/invalid-cg.mlir | 19 ++
mlir/test/Dialect/OpenACC/ops-cg.mlir | 194 ++++++++++++++
5 files changed, 610 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
index f179cfd752c62..be2a0ff3ec028 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
@@ -210,4 +210,157 @@ def OpenACC_FirstprivateMapInitialOp
let extraClassDeclaration = extraClassDeclarationBase;
}
+//===----------------------------------------------------------------------===//
+// acc.par_width
+//===----------------------------------------------------------------------===//
+
+def OpenACC_ParWidthOp
+ : OpenACC_Op<"par_width", [NoMemoryEffect, AlwaysSpeculatable]> {
+ let summary = "Specify parallel width for a GPU dimension";
+ let description = [{
+ The `acc.par_width` operation specifies the parallel width for a
+ given GPU parallel dimension. It is used as an input to
+ `acc.compute_region` to define the launch configuration.
+
+ The optional `launchArg` operand provides a known width value. When
+ absent, the width is unknown and must be determined later (either at
+ compile time by analysis or at runtime).
+
+ Examples:
+
+ ```mlir
+ // Known width from SSA value
+ %w1 = acc.par_width %vector_len {par_dim = #acc.par_dim<thread_x>}
+
+ // Unknown width (to be computed later)
+ %w2 = acc.par_width {par_dim = #acc.par_dim<block_x>}
+ ```
+ }];
+ let arguments = (ins Optional<Index>:$launchArg,
+ OpenACC_GPUParallelDimAttr:$par_dim);
+ let results = (outs Index:$output);
+ let assemblyFormat = [{
+ ($launchArg^)? attr-dict
+ }];
+}
+
+//===----------------------------------------------------------------------===//
+// acc.compute_region
+//===----------------------------------------------------------------------===//
+
+// Local type constraint for gpu::AsyncTokenType.
+def OpenACC_GPUAsyncTokenType : Type<
+ CPred<"::llvm::isa<::mlir::gpu::AsyncTokenType>($_self)">,
+ "GPU async token type">;
+
+def OpenACC_ComputeRegionOp
+ : OpenACC_Op<"compute_region",
+ [OffloadRegionOpInterface, AffineScope,
+ RecursiveMemoryEffects,
+ SingleBlockImplicitTerminator<"YieldOp">,
+ IsolatedFromAbove, AttrSizedOperandSegments]> {
+ let summary = "Compute region for GPU execution";
+ let description = [{
+ The `acc.compute_region` operation wraps a region of code that will be
+ compiled and executed on a GPU. It is typically produced by lowering
+ OpenACC compute constructs (`acc.parallel`, `acc.kernels`, `acc.serial`)
+ but can also be targeted directly by other frontends or lowered from
+ other constructs that benefit from the automatic parallelization and data
+ mapping facilities that the `acc` dialect provides. It serves as the
+ bridge between the high-level representation and the `gpu.launch`
+ operation.
+
+ The operation is `IsolatedFromAbove`: all values used inside the
+ region must be explicitly captured. Values are captured in two ways:
+
+ - Launch arguments (`launch`): Results of `acc.par_width`
+ operations that define the parallel launch configuration. These
+ become `index`-typed block arguments representing the parallel
+ width for each dimension.
+
+ - Input arguments (`ins`): Arbitrary values captured from outside
+ the region (data pointers, scalars, etc.). These become block
+ arguments with their original types.
+
+ The `origin` attribute records which construct produced this compute
+ region (e.g., `"acc.parallel"`, `"acc.kernels"`). This is intended to
+ be solely informational.
+
+ Example:
+
+ ```mlir
+ %w0 = acc.par_width %c128 {par_dim = #acc.par_dim<thread_x>}
+ %w1 = acc.par_width %c8 {par_dim = #acc.par_dim<block_x>}
+ acc.compute_region launch(%arg0 = %w0, %arg1 = %w1)
+ ins(%arg2 = %data) : (memref<1024xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c1024 = arith.constant 1024 : index
+ scf.parallel (%iv) = (%c0) to (%c1024) step (%c1) {
+ %v = memref.load %arg2[%iv] : memref<1024xf32>
+ scf.reduce
+ } {acc.par_dims = #acc<par_dims[thread_x]>}
+ acc.yield
+ } {origin = "acc.parallel"}
+ ```
+ }];
+
+ let arguments = (ins Variadic<Index>:$launchArgs,
+ Variadic<AnyType>:$inputArgs,
+ Optional<OpenACC_GPUAsyncTokenType>:$stream,
+ StrAttr:$origin,
+ OptionalAttr<FlatSymbolRefAttr>:$kernel_func_name,
+ OptionalAttr<FlatSymbolRefAttr>:$kernel_module_name);
+
+ let results = (outs Variadic<AnyType>:$results);
+
+ let regions = (region AnyRegion:$region);
+
+ let extraClassDeclaration = [{
+ /// Look up the par_width op for the given dimension among launch args.
+ std::optional<mlir::Value> getLaunchArg(
+ ::mlir::acc::GPUParallelDimAttr parDim);
+
+ /// Get the known (non-empty) launch value for a dimension.
+ std::optional<mlir::Value> getKnownLaunchArg(
+ ::mlir::acc::GPUParallelDimAttr parDim);
+
+ /// Get the known constant launch value for a dimension.
+ std::optional<uint64_t> getKnownConstantLaunchArg(
+ ::mlir::acc::GPUParallelDimAttr parDim);
+
+ /// Add a new input argument, appending to both the operand list and
+ /// the region block arguments. Returns the new block argument.
+ ::mlir::BlockArgument appendInputArg(::mlir::Value);
+
+ /// Check whether all parallel dimensions have width 1.
+ bool isEffectivelySerial();
+
+ /// Get the block argument representing the width for a given dimension.
+ ::mlir::BlockArgument parDimToWidth(
+ ::mlir::acc::GPUParallelDimAttr parDim);
+
+ /// Get the block argument for a specific gpu::Processor.
+ ::mlir::BlockArgument gpuParWidth(::mlir::gpu::Processor);
+
+ /// Collect all GPU parallel dimensions present in the launch config.
+ llvm::SmallVector<::mlir::acc::GPUParallelDimAttr> getLaunchParDims();
+
+ /// Get the body block of the compute region.
+ ::mlir::Block *getBody() { return &getRegion().front(); }
+
+ /// Get the terminator of the compute region.
+ ::mlir::Operation *getTerminator() {
+ return &getRegion().back().back();
+ }
+
+ /// Map a block argument back to its corresponding operand
+ /// ($launchArgs or $inputArgs).
+ ::mlir::Value getOperand(::mlir::BlockArgument blockArg);
+ }];
+
+ let hasVerifier = 1;
+ let hasCustomAssemblyFormat = 1;
+}
+
#endif // OPENACC_CG_OPS
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 2bb1654cb6369..33d3b84b32b98 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2845,7 +2845,7 @@ def OpenACC_LoopOp
def OpenACC_YieldOp : OpenACC_Op<"yield", [Pure, ReturnLike, Terminator,
ParentOneOf<["FirstprivateRecipeOp, LoopOp, ParallelOp, PrivateRecipeOp,"
"ReductionRecipeOp, ReductionInitOp, ReductionCombineRegionOp,"
- "SerialOp, AtomicUpdateOp"]>]> {
+ "SerialOp, AtomicUpdateOp, ComputeRegionOp"]>]> {
let summary = "Acc yield and termination operation";
let description = [{
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
index 881234cdd6198..6b0e1c07670ac 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -16,6 +16,7 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Region.h"
@@ -325,6 +326,248 @@ void ReductionCombineOp::getEffects(
SideEffects::DefaultResource::get());
}
+//===----------------------------------------------------------------------===//
+// ComputeRegionOp
+//===----------------------------------------------------------------------===//
+
+static ParWidthOp getParWidthOpForLaunchArg(ComputeRegionOp op,
+ GPUParallelDimAttr parDim) {
+ for (auto launchArg : op.getLaunchArgs()) {
+ auto parOp = launchArg.getDefiningOp<ParWidthOp>();
+ if (!parOp)
+ continue;
+ auto launchArgDim = cast<GPUParallelDimAttr>(parOp.getParDim());
+ if (launchArgDim == parDim)
+ return parOp;
+ }
+ return nullptr;
+}
+
+std::optional<Value>
+ComputeRegionOp::getLaunchArg(GPUParallelDimAttr parDim) {
+ if (auto parWidthOp = getParWidthOpForLaunchArg(*this, parDim))
+ return parWidthOp.getResult();
+ return {};
+}
+
+std::optional<Value>
+ComputeRegionOp::getKnownLaunchArg(GPUParallelDimAttr parDim) {
+ if (auto parWidthOp = getParWidthOpForLaunchArg(*this, parDim))
+ if (parWidthOp.getLaunchArg())
+ return parWidthOp.getLaunchArg();
+ return {};
+}
+
+std::optional<uint64_t>
+ComputeRegionOp::getKnownConstantLaunchArg(GPUParallelDimAttr parDim) {
+ auto knownParWidth = getKnownLaunchArg(parDim);
+ if (knownParWidth.has_value())
+ return getConstantIntValue(knownParWidth.value());
+ return {};
+}
+
+BlockArgument ComputeRegionOp::appendInputArg(Value value) {
+ getInputArgsMutable().append(value);
+ return getBody()->addArgument(value.getType(), getLoc());
+}
+
+bool ComputeRegionOp::isEffectivelySerial() {
+ auto *ctx = getContext();
+
+ if (getLaunchArg(GPUParallelDimAttr::seqDim(ctx)))
+ return true;
+
+ auto checkDim = [&](GPUParallelDimAttr dim) -> bool {
+ auto val = getKnownConstantLaunchArg(dim);
+ return val && *val == 1;
+ };
+
+ return checkDim(GPUParallelDimAttr::threadXDim(ctx)) &&
+ checkDim(GPUParallelDimAttr::threadYDim(ctx)) &&
+ checkDim(GPUParallelDimAttr::threadZDim(ctx)) &&
+ checkDim(GPUParallelDimAttr::blockXDim(ctx)) &&
+ checkDim(GPUParallelDimAttr::blockYDim(ctx)) &&
+ checkDim(GPUParallelDimAttr::blockZDim(ctx));
+}
+
+BlockArgument ComputeRegionOp::parDimToWidth(GPUParallelDimAttr parDim) {
+ for (auto [pos, launchArg] : llvm::enumerate(getLaunchArgs())) {
+ auto parOp = launchArg.getDefiningOp<ParWidthOp>();
+ assert(parOp);
+ auto launchArgDim = cast<GPUParallelDimAttr>(parOp.getParDim());
+ if (launchArgDim == parDim) {
+ assert(pos < getRegion().front().getNumArguments() &&
+ "launch arg position out of range");
+ return getRegion().front().getArgument(pos);
+ }
+ }
+ llvm_unreachable("attempting to get unspecified parDim");
+}
+
+SmallVector<GPUParallelDimAttr> ComputeRegionOp::getLaunchParDims() {
+ SmallVector<GPUParallelDimAttr> parDims;
+ for (auto launchArg : getLaunchArgs()) {
+ auto parOp = launchArg.getDefiningOp<ParWidthOp>();
+ auto launchArgDim = cast<GPUParallelDimAttr>(parOp.getParDim());
+ int64_t dimInt = launchArgDim.getValue().getInt();
+ parDims.push_back(intToParDim(getContext(), dimInt));
+ }
+ return parDims;
+}
+
+Value ComputeRegionOp::getOperand(BlockArgument blockArg) {
+ unsigned argNumber = blockArg.getArgNumber();
+ unsigned numLaunchArgs = getLaunchArgs().size();
+ unsigned numInputArgs = getInputArgs().size();
+ assert(argNumber < (numLaunchArgs + numInputArgs) &&
+ "invalid block argument");
+ if (argNumber < numLaunchArgs)
+ return getLaunchArgs()[argNumber];
+ return getInputArgs()[argNumber - numLaunchArgs];
+}
+
+BlockArgument ComputeRegionOp::gpuParWidth(gpu::Processor processor) {
+ return parDimToWidth(GPUParallelDimAttr::get(getContext(), processor));
+}
+
+LogicalResult ComputeRegionOp::verify() {
+ for (auto op : getLaunchArgs())
+ if (!op.getDefiningOp<acc::ParWidthOp>())
+ return emitOpError(
+ "launch arguments must be results of acc.par_width operations");
+
+ unsigned expectedBlockArgs =
+ getLaunchArgs().size() + getInputArgs().size();
+ unsigned actualBlockArgs = getRegion().front().getNumArguments();
+ if (expectedBlockArgs != actualBlockArgs)
+ return emitOpError("expected ")
+ << expectedBlockArgs << " block arguments (launch + input), got "
+ << actualBlockArgs;
+
+ return success();
+}
+
+void ComputeRegionOp::print(OpAsmPrinter &p) {
+ ValueRange regionArgs = getBody()->getArguments();
+ ValueRange launchArgs = getLaunchArgs();
+ ValueRange inputArgs = getInputArgs();
+
+ assert(regionArgs.size() == (launchArgs.size() + inputArgs.size()) &&
+ "region args mismatch");
+
+ if (getStream())
+ p << " stream(" << getStream() << " : " << getStream().getType() << ")";
+
+ size_t i = 0;
+ if (!launchArgs.empty()) {
+ p << " launch(";
+ for (size_t j = 0; j < launchArgs.size(); ++j, ++i) {
+ p << regionArgs[i] << " = " << launchArgs[j];
+ if (j < launchArgs.size() - 1)
+ p << ", ";
+ }
+ p << ")";
+ }
+ if (!inputArgs.empty()) {
+ p << " ins(";
+ for (size_t j = 0; j < inputArgs.size(); ++j, ++i) {
+ p << regionArgs[i] << " = " << inputArgs[j];
+ if (j < inputArgs.size() - 1)
+ p << ", ";
+ }
+ p << ") : (";
+ for (size_t j = 0; j < inputArgs.size(); ++j) {
+ p << inputArgs[j].getType();
+ if (j < inputArgs.size() - 1)
+ p << ", ";
+ }
+ p << ")";
+ }
+ p.printOptionalArrowTypeList(getResultTypes());
+ p << " ";
+ p.printRegion(getRegion(), /*printEntryBlockArgs=*/false);
+ p.printOptionalAttrDict((*this)->getAttrs(),
+ /*elidedAttrs=*/getOperandSegmentSizeAttr());
+}
+
+ParseResult ComputeRegionOp::parse(OpAsmParser &parser,
+ OperationState &result) {
+ auto &builder = parser.getBuilder();
+
+ SmallVector<OpAsmParser::Argument> regionArgs;
+ OpAsmParser::UnresolvedOperand streamOperand;
+ Type streamType;
+ SmallVector<OpAsmParser::UnresolvedOperand> launchOperands;
+ SmallVector<OpAsmParser::UnresolvedOperand> inputOperands;
+ SmallVector<Type> types;
+
+ bool hasStream = false;
+ if (succeeded(parser.parseOptionalKeyword("stream"))) {
+ hasStream = true;
+ if (parser.parseLParen() || parser.parseOperand(streamOperand) ||
+ parser.parseColon() || parser.parseType(streamType) ||
+ parser.parseRParen())
+ return failure();
+ }
+
+ if (succeeded(parser.parseOptionalKeyword("launch"))) {
+ if (parser.parseAssignmentList(regionArgs, launchOperands))
+ return failure();
+ Type indexType = builder.getIndexType();
+ for (size_t i = 0; i < regionArgs.size(); ++i)
+ types.push_back(indexType);
+ }
+
+ if (succeeded(parser.parseOptionalKeyword("ins"))) {
+ if (parser.parseAssignmentList(regionArgs, inputOperands) ||
+ parser.parseColon() || parser.parseLParen() ||
+ parser.parseTypeList(types) || parser.parseRParen())
+ return failure();
+ }
+
+ if (parser.parseOptionalArrowTypeList(result.types))
+ return failure();
+
+ for (auto [iterArg, type] : llvm::zip_equal(regionArgs, types))
+ iterArg.type = type;
+
+ Region *body = result.addRegion();
+ if (parser.parseRegion(*body, regionArgs))
+ return failure();
+
+ const size_t numLaunchOperands = launchOperands.size();
+ const size_t numInputOperands = inputOperands.size();
+ assert(numLaunchOperands + numInputOperands == regionArgs.size() &&
+ "compute region args mismatch");
+
+ result.addAttribute(ComputeRegionOp::getOperandSegmentSizeAttr(),
+ builder.getDenseI32ArrayAttr(
+ {static_cast<int32_t>(numLaunchOperands),
+ static_cast<int32_t>(numInputOperands),
+ hasStream ? 1 : 0}));
+
+ for (size_t i = 0; i < numLaunchOperands; ++i) {
+ if (parser.resolveOperand(launchOperands[i], types[i], result.operands))
+ return failure();
+ }
+
+ for (size_t i = numLaunchOperands; i < regionArgs.size(); ++i) {
+ if (parser.resolveOperand(inputOperands[i - numLaunchOperands], types[i],
+ result.operands))
+ return failure();
+ }
+
+ if (hasStream) {
+ if (parser.resolveOperand(streamOperand, streamType, result.operands))
+ return failure();
+ }
+
+ if (parser.parseOptionalAttrDict(result.attributes))
+ return failure();
+
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// GPUParallelDimAttr
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/OpenACC/invalid-cg.mlir b/mlir/test/Dialect/OpenACC/invalid-cg.mlir
index bc2408ceafe8a..d218bc505a5ea 100644
--- a/mlir/test/Dialect/OpenACC/invalid-cg.mlir
+++ b/mlir/test/Dialect/OpenACC/invalid-cg.mlir
@@ -19,3 +19,22 @@ scf.parallel (%iv) = (%c0_2) to (%c4_2) step (%c1_2) {
scf.reduce
// expected-error at +1 {{expected one of ::mlir::gpu::Processor enum names}}
} {acc.par_dims = #acc<par_dims[gang]>}
+
+// -----
+
+%c32 = arith.constant 32 : index
+// expected-error at +1 {{'acc.compute_region' op launch arguments must be results of acc.par_width operations}}
+acc.compute_region launch(%arg0 = %c32) {
+ acc.yield
+} {origin = "acc.parallel"}
+
+// -----
+
+// Use generic form to introduce an extra block argument.
+%c64 = arith.constant 64 : index
+%w = acc.par_width %c64 {par_dim = #acc.par_dim<thread_x>}
+// expected-error at +1 {{'acc.compute_region' op expected 1 block arguments (launch + input), got 2}}
+"acc.compute_region"(%w) <{operandSegmentSizes = array<i32: 1, 0, 0>}> ({
+^bb0(%arg0: index, %extra: index):
+ "acc.yield"() : () -> ()
+}) {origin = "acc.parallel"} : (index) -> ()
diff --git a/mlir/test/Dialect/OpenACC/ops-cg.mlir b/mlir/test/Dialect/OpenACC/ops-cg.mlir
index e6453da21ed79..7a61261d97ba9 100644
--- a/mlir/test/Dialect/OpenACC/ops-cg.mlir
+++ b/mlir/test/Dialect/OpenACC/ops-cg.mlir
@@ -77,3 +77,197 @@ func.func @par_dims_2d_grid() {
return
}
// CHECK: acc.par_dims = #acc<par_dims[block_y, thread_y]>
+
+// -----
+
+// CHECK-LABEL: func @compute_region_single_dim
+func.func @compute_region_single_dim(%data: memref<1024xf32>,
+ %result: memref<f32>) {
+ %c128 = arith.constant 128 : index
+ %copyin = acc.copyin varPtr(%data : memref<1024xf32>) -> memref<1024xf32>
+ %copy = acc.copyin varPtr(%result : memref<f32>) -> memref<f32> {dataClause = #acc<data_clause acc_copy>}
+ acc.kernel_environment dataOperands(%copyin, %copy : memref<1024xf32>, memref<f32>) {
+ %w0 = acc.par_width %c128 {par_dim = #acc.par_dim<thread_x>}
+ acc.compute_region launch(%arg0 = %w0)
+ ins(%arg1 = %copyin, %arg2 = %copy) : (memref<1024xf32>, memref<f32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c128_inner = arith.constant 128 : index
+ %cst = arith.constant 0.000000e+00 : f32
+ memref.store %cst, %arg2[] : memref<f32>
+ scf.parallel (%iv) = (%c0) to (%c128_inner) step (%c1) {
+ %val = memref.load %arg1[%iv] : memref<1024xf32>
+ %cur = memref.load %arg2[] : memref<f32>
+ %sum = arith.addf %cur, %val : f32
+ memref.store %sum, %arg2[] : memref<f32>
+ scf.reduce
+ } {acc.par_dims = #acc<par_dims[thread_x]>}
+ acc.yield
+ } {origin = "acc.parallel"}
+ }
+ acc.copyout accPtr(%copy : memref<f32>) to varPtr(%result : memref<f32>) {dataClause = #acc<data_clause acc_copy>}
+ acc.delete accPtr(%copyin : memref<1024xf32>)
+ return
+}
+// CHECK: %[[W:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<thread_x>}
+// CHECK: acc.compute_region launch(%{{.*}} = %[[W]]) ins({{.*}}) : (memref<1024xf32>, memref<f32>) {
+// CHECK: acc.yield
+// CHECK: } {origin = "acc.parallel"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_two_dims
+func.func @compute_region_two_dims(%data: memref<8xi32>,
+ %reduction_var: memref<i32>) {
+ %c8 = arith.constant 8 : index
+ %c128 = arith.constant 128 : index
+ %copyin_data = acc.copyin varPtr(%data : memref<8xi32>) -> memref<8xi32>
+ %copyin_red = acc.copyin varPtr(%reduction_var : memref<i32>) -> memref<i32> {dataClause = #acc<data_clause acc_reduction>}
+ acc.kernel_environment dataOperands(%copyin_data, %copyin_red : memref<8xi32>, memref<i32>) {
+ %w0 = acc.par_width %c8 {par_dim = #acc.par_dim<block_x>}
+ %w1 = acc.par_width %c128 {par_dim = #acc.par_dim<thread_x>}
+ acc.compute_region launch(%arg0 = %w0, %arg1 = %w1)
+ ins(%arg2 = %copyin_data, %arg3 = %copyin_red) : (memref<8xi32>, memref<i32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c8_inner = arith.constant 8 : index
+ %c0_i32 = arith.constant 0 : i32
+ %init = acc.reduction_init %arg3 <add> : memref<i32> {
+ %alloca = memref.alloca() : memref<i32>
+ memref.store %c0_i32, %alloca[] : memref<i32>
+ acc.yield %alloca : memref<i32>
+ }
+ scf.parallel (%iv) = (%c0) to (%c8_inner) step (%c1) {
+ %v = memref.load %arg2[%iv] : memref<8xi32>
+ %cur = memref.load %init[] : memref<i32>
+ %sum = arith.addi %cur, %v : i32
+ memref.store %sum, %init[] : memref<i32>
+ scf.reduce
+ } {acc.par_dims = #acc<par_dims[block_x, thread_x]>}
+ acc.reduction_combine %init into %arg3 <add> : memref<i32>
+ acc.yield
+ } {origin = "acc.parallel"}
+ }
+ acc.copyout accPtr(%copyin_red : memref<i32>) to varPtr(%reduction_var : memref<i32>) {dataClause = #acc<data_clause acc_reduction>}
+ acc.delete accPtr(%copyin_data : memref<8xi32>)
+ return
+}
+// CHECK: %[[W0:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<block_x>}
+// CHECK: %[[W1:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<thread_x>}
+// CHECK: acc.compute_region launch(%{{.*}} = %[[W0]], %{{.*}} = %[[W1]]) ins({{.*}}) : (memref<8xi32>, memref<i32>) {
+// CHECK: acc.yield
+// CHECK: } {origin = "acc.parallel"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_unknown_width
+func.func @compute_region_unknown_width(%data: memref<100xf32>) {
+ %copyin = acc.copyin varPtr(%data : memref<100xf32>) -> memref<100xf32>
+ acc.kernel_environment dataOperands(%copyin : memref<100xf32>) {
+ %w0 = acc.par_width {par_dim = #acc.par_dim<thread_x>}
+ acc.compute_region launch(%arg0 = %w0)
+ ins(%arg1 = %copyin) : (memref<100xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c100 = arith.constant 100 : index
+ scf.parallel (%iv) = (%c0) to (%c100) step (%c1) {
+ scf.reduce
+ } {acc.par_dims = #acc<par_dims[thread_x]>}
+ acc.yield
+ } {origin = "acc.kernels"}
+ }
+ acc.delete accPtr(%copyin : memref<100xf32>)
+ return
+}
+// CHECK: %[[W:.*]] = acc.par_width {par_dim = #acc.par_dim<thread_x>}
+// CHECK: acc.compute_region launch(%{{.*}} = %[[W]]) ins({{.*}}) : (memref<100xf32>) {
+// CHECK: acc.yield
+// CHECK: } {origin = "acc.kernels"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_no_launch
+func.func @compute_region_no_launch(%a: memref<i32>, %b: memref<i32>) {
+ %copy_a = acc.copyin varPtr(%a : memref<i32>) -> memref<i32> {dataClause = #acc<data_clause acc_copy>}
+ %copy_b = acc.copyin varPtr(%b : memref<i32>) -> memref<i32> {dataClause = #acc<data_clause acc_copy>}
+ acc.kernel_environment dataOperands(%copy_a, %copy_b : memref<i32>, memref<i32>) {
+ acc.compute_region
+ ins(%arg0 = %copy_a, %arg1 = %copy_b) : (memref<i32>, memref<i32>) {
+ %c1 = arith.constant 1 : i32
+ memref.store %c1, %arg0[] : memref<i32>
+ memref.store %c1, %arg1[] : memref<i32>
+ acc.yield
+ } {origin = "acc.serial"}
+ }
+ acc.copyout accPtr(%copy_a : memref<i32>) to varPtr(%a : memref<i32>) {dataClause = #acc<data_clause acc_copy>}
+ acc.copyout accPtr(%copy_b : memref<i32>) to varPtr(%b : memref<i32>) {dataClause = #acc<data_clause acc_copy>}
+ return
+}
+// CHECK: acc.compute_region ins({{.*}}) : (memref<i32>, memref<i32>) {
+// CHECK: acc.yield
+// CHECK: } {origin = "acc.serial"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_launch_only
+func.func @compute_region_launch_only() {
+ %c32 = arith.constant 32 : index
+ %w0 = acc.par_width %c32 {par_dim = #acc.par_dim<thread_x>}
+ acc.compute_region launch(%arg0 = %w0) {
+ acc.yield
+ } {origin = "acc.parallel"}
+ return
+}
+// CHECK: %[[W:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<thread_x>}
+// CHECK: acc.compute_region launch(%{{.*}} = %[[W]]) {
+// CHECK: acc.yield
+// CHECK: } {origin = "acc.parallel"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_all_fields
+// CHECK-SAME: (%{{.*}}: memref<1024xf32>, %[[STREAM:.*]]: !gpu.async.token)
+func.func @compute_region_all_fields(%data: memref<1024xf32>,
+ %stream: !gpu.async.token) {
+ %c128 = arith.constant 128 : index
+ %c8 = arith.constant 8 : index
+ %copyin = acc.copyin varPtr(%data : memref<1024xf32>) -> memref<1024xf32>
+ acc.kernel_environment dataOperands(%copyin : memref<1024xf32>) {
+ %w0 = acc.par_width %c8 {par_dim = #acc.par_dim<block_x>}
+ %w1 = acc.par_width %c128 {par_dim = #acc.par_dim<thread_x>}
+ acc.compute_region stream(%stream : !gpu.async.token)
+ launch(%arg0 = %w0, %arg1 = %w1)
+ ins(%arg2 = %copyin) : (memref<1024xf32>) {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c1024 = arith.constant 1024 : index
+ scf.parallel (%iv) = (%c0) to (%c1024) step (%c1) {
+ scf.reduce
+ } {acc.par_dims = #acc<par_dims[block_x, thread_x]>}
+ acc.yield
+ } {kernel_func_name = @compute_kernel, kernel_module_name = @device_module, origin = "acc.parallel"}
+ }
+ acc.delete accPtr(%copyin : memref<1024xf32>)
+ return
+}
+// CHECK: %[[W0:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<block_x>}
+// CHECK: %[[W1:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<thread_x>}
+// CHECK: acc.compute_region stream(%[[STREAM]] : !gpu.async.token) launch(%{{.*}} = %[[W0]], %{{.*}} = %[[W1]]) ins({{.*}}) : (memref<1024xf32>) {
+// CHECK: acc.yield
+// CHECK: } {kernel_func_name = @compute_kernel, kernel_module_name = @device_module, origin = "acc.parallel"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_with_results
+func.func @compute_region_with_results() -> i32 {
+ %w0 = acc.par_width {par_dim = #acc.par_dim<thread_x>}
+ %0 = acc.compute_region launch(%arg0 = %w0) -> i32 {
+ %c0_i32 = arith.constant 0 : i32
+ acc.yield %c0_i32 : i32
+ } {origin = "acc.parallel"}
+ return %0 : i32
+}
+// CHECK: %[[W:.*]] = acc.par_width {par_dim = #acc.par_dim<thread_x>}
+// CHECK: {{.*}} = acc.compute_region launch(%{{.*}} = %[[W]]) -> i32 {
+// CHECK: acc.yield
+// CHECK: } {origin = "acc.parallel"}
>From ebb29dbeafcc62a437f8775599173ad0647503d4 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Thu, 5 Mar 2026 11:28:37 -0800
Subject: [PATCH 2/3] Fix format
---
mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
index 6b0e1c07670ac..5de3e29800066 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -343,8 +343,7 @@ static ParWidthOp getParWidthOpForLaunchArg(ComputeRegionOp op,
return nullptr;
}
-std::optional<Value>
-ComputeRegionOp::getLaunchArg(GPUParallelDimAttr parDim) {
+std::optional<Value> ComputeRegionOp::getLaunchArg(GPUParallelDimAttr parDim) {
if (auto parWidthOp = getParWidthOpForLaunchArg(*this, parDim))
return parWidthOp.getResult();
return {};
@@ -436,8 +435,7 @@ LogicalResult ComputeRegionOp::verify() {
return emitOpError(
"launch arguments must be results of acc.par_width operations");
- unsigned expectedBlockArgs =
- getLaunchArgs().size() + getInputArgs().size();
+ unsigned expectedBlockArgs = getLaunchArgs().size() + getInputArgs().size();
unsigned actualBlockArgs = getRegion().front().getNumArguments();
if (expectedBlockArgs != actualBlockArgs)
return emitOpError("expected ")
@@ -540,11 +538,11 @@ ParseResult ComputeRegionOp::parse(OpAsmParser &parser,
assert(numLaunchOperands + numInputOperands == regionArgs.size() &&
"compute region args mismatch");
- result.addAttribute(ComputeRegionOp::getOperandSegmentSizeAttr(),
- builder.getDenseI32ArrayAttr(
- {static_cast<int32_t>(numLaunchOperands),
- static_cast<int32_t>(numInputOperands),
- hasStream ? 1 : 0}));
+ result.addAttribute(
+ ComputeRegionOp::getOperandSegmentSizeAttr(),
+ builder.getDenseI32ArrayAttr({static_cast<int32_t>(numLaunchOperands),
+ static_cast<int32_t>(numInputOperands),
+ hasStream ? 1 : 0}));
for (size_t i = 0; i < numLaunchOperands; ++i) {
if (parser.resolveOperand(launchOperands[i], types[i], result.operands))
>From 68738721dc0647d275efd882f828bbd4985bb6de Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Thu, 5 Mar 2026 12:57:21 -0800
Subject: [PATCH 3/3] Add ParWidth type to capture par dimension info
---
mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td | 12 ++++++------
mlir/include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td | 8 ++++++++
mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp | 9 ++-------
mlir/test/Dialect/OpenACC/invalid-cg.mlir | 5 +++--
4 files changed, 19 insertions(+), 15 deletions(-)
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
index be2a0ff3ec028..ebb0e6132fee3 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
@@ -238,7 +238,7 @@ def OpenACC_ParWidthOp
}];
let arguments = (ins Optional<Index>:$launchArg,
OpenACC_GPUParallelDimAttr:$par_dim);
- let results = (outs Index:$output);
+ let results = (outs OpenACC_ParWidthType:$output);
let assemblyFormat = [{
($launchArg^)? attr-dict
}];
@@ -273,10 +273,10 @@ def OpenACC_ComputeRegionOp
The operation is `IsolatedFromAbove`: all values used inside the
region must be explicitly captured. Values are captured in two ways:
- - Launch arguments (`launch`): Results of `acc.par_width`
- operations that define the parallel launch configuration. These
- become `index`-typed block arguments representing the parallel
- width for each dimension.
+ - Launch arguments (`launch`): Results of operations that define
+ the parallel launch configuration. These are `!acc.par_width`-typed
+ and become block arguments representing the parallel width for each
+ dimension.
- Input arguments (`ins`): Arbitrary values captured from outside
the region (data pointers, scalars, etc.). These become block
@@ -305,7 +305,7 @@ def OpenACC_ComputeRegionOp
```
}];
- let arguments = (ins Variadic<Index>:$launchArgs,
+ let arguments = (ins Variadic<OpenACC_ParWidthType>:$launchArgs,
Variadic<AnyType>:$inputArgs,
Optional<OpenACC_GPUAsyncTokenType>:$stream,
StrAttr:$origin,
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td
index 117272693d626..bba385e69c0f2 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td
@@ -33,4 +33,12 @@ def OpenACC_DeclareTokenType : OpenACC_Type<"DeclareToken", "declare_token"> {
}];
}
+def OpenACC_ParWidthType : OpenACC_Type<"ParWidth", "par_width"> {
+ let summary = "parallel width token type";
+ let description = [{
+ Represents a type that is consumed by a compute region in order to
+ capture its parallel dimension arguments.
+ }];
+}
+
#endif // OPENACC_OPS_TYPES
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
index 5de3e29800066..85dba70dbde1d 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -430,11 +430,6 @@ BlockArgument ComputeRegionOp::gpuParWidth(gpu::Processor processor) {
}
LogicalResult ComputeRegionOp::verify() {
- for (auto op : getLaunchArgs())
- if (!op.getDefiningOp<acc::ParWidthOp>())
- return emitOpError(
- "launch arguments must be results of acc.par_width operations");
-
unsigned expectedBlockArgs = getLaunchArgs().size() + getInputArgs().size();
unsigned actualBlockArgs = getRegion().front().getNumArguments();
if (expectedBlockArgs != actualBlockArgs)
@@ -511,9 +506,9 @@ ParseResult ComputeRegionOp::parse(OpAsmParser &parser,
if (succeeded(parser.parseOptionalKeyword("launch"))) {
if (parser.parseAssignmentList(regionArgs, launchOperands))
return failure();
- Type indexType = builder.getIndexType();
+ auto parWidthType = acc::ParWidthType::get(builder.getContext());
for (size_t i = 0; i < regionArgs.size(); ++i)
- types.push_back(indexType);
+ types.push_back(parWidthType);
}
if (succeeded(parser.parseOptionalKeyword("ins"))) {
diff --git a/mlir/test/Dialect/OpenACC/invalid-cg.mlir b/mlir/test/Dialect/OpenACC/invalid-cg.mlir
index d218bc505a5ea..f788e6c03bcc9 100644
--- a/mlir/test/Dialect/OpenACC/invalid-cg.mlir
+++ b/mlir/test/Dialect/OpenACC/invalid-cg.mlir
@@ -22,8 +22,9 @@ scf.parallel (%iv) = (%c0_2) to (%c4_2) step (%c1_2) {
// -----
+// expected-note at +1 {{prior use here}}
%c32 = arith.constant 32 : index
-// expected-error at +1 {{'acc.compute_region' op launch arguments must be results of acc.par_width operations}}
+// expected-error at +1 {{use of value '%c32' expects different type than prior uses: '!acc.par_width' vs 'index'}}
acc.compute_region launch(%arg0 = %c32) {
acc.yield
} {origin = "acc.parallel"}
@@ -37,4 +38,4 @@ acc.compute_region launch(%arg0 = %c32) {
"acc.compute_region"(%w) <{operandSegmentSizes = array<i32: 1, 0, 0>}> ({
^bb0(%arg0: index, %extra: index):
"acc.yield"() : () -> ()
-}) {origin = "acc.parallel"} : (index) -> ()
+}) {origin = "acc.parallel"} : (!acc.par_width) -> ()
More information about the Mlir-commits mailing list