[Mlir-commits] [mlir] [mlir][acc] Add acc.compute_region and acc.par_width operations (PR #184864)

Thu Mar 5 12:57:52 PST 2026

https://github.com/razvanlupusoru updated https://github.com/llvm/llvm-project/pull/184864

>From 77c688242fa938eb9fda86f0cefbd8d131659036 Mon Sep 17 00:00:00 2001
From: Scott Manley <rscottmanley at gmail.com>
Date: Thu, 5 Mar 2026 11:17:22 -0800
Subject: [PATCH 1/3] [mlir][acc] Add acc.compute_region and acc.par_width
 operations

Introduce two new codegen operations to the acc dialect that model
GPU compute region execution and parallel launch configuration:
- acc.par_width: specifies the parallel width for a GPU parallel dimension.
- acc.compute_region: wraps a region of code for GPU execution,
  capturing launch configuration (from acc.par_width results) and input
  values as block arguments.

These operations bridge the gap between high-level OpenACC compute
constructs (acc.parallel, acc.kernels, acc.serial) and gpu.launch.
The passes that do these transformations will soon follow.
---
 .../mlir/Dialect/OpenACC/OpenACCCGOps.td      | 153 +++++++++++
 .../mlir/Dialect/OpenACC/OpenACCOps.td        |   2 +-
 mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp     | 243 ++++++++++++++++++
 mlir/test/Dialect/OpenACC/invalid-cg.mlir     |  19 ++
 mlir/test/Dialect/OpenACC/ops-cg.mlir         | 194 ++++++++++++++
 5 files changed, 610 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
index f179cfd752c62..be2a0ff3ec028 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
@@ -210,4 +210,157 @@ def OpenACC_FirstprivateMapInitialOp
   let extraClassDeclaration = extraClassDeclarationBase;
 }
 
+//===----------------------------------------------------------------------===//
+// acc.par_width
+//===----------------------------------------------------------------------===//
+
+def OpenACC_ParWidthOp
+    : OpenACC_Op<"par_width", [NoMemoryEffect, AlwaysSpeculatable]> {
+  let summary = "Specify parallel width for a GPU dimension";
+  let description = [{
+    The `acc.par_width` operation specifies the parallel width for a
+    given GPU parallel dimension. It is used as an input to
+    `acc.compute_region` to define the launch configuration.
+
+    The optional `launchArg` operand provides a known width value. When
+    absent, the width is unknown and must be determined later (either at
+    compile time by analysis or at runtime).
+
+    Examples:
+
+    ```mlir
+    // Known width from SSA value
+    %w1 = acc.par_width %vector_len {par_dim = #acc.par_dim<thread_x>}
+
+    // Unknown width (to be computed later)
+    %w2 = acc.par_width {par_dim = #acc.par_dim<block_x>}
+    ```
+  }];
+  let arguments = (ins Optional<Index>:$launchArg,
+                       OpenACC_GPUParallelDimAttr:$par_dim);
+  let results = (outs Index:$output);
+  let assemblyFormat = [{
+    ($launchArg^)? attr-dict
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// acc.compute_region
+//===----------------------------------------------------------------------===//
+
+// Local type constraint for gpu::AsyncTokenType.
+def OpenACC_GPUAsyncTokenType : Type<
+  CPred<"::llvm::isa<::mlir::gpu::AsyncTokenType>($_self)">,
+  "GPU async token type">;
+
+def OpenACC_ComputeRegionOp
+    : OpenACC_Op<"compute_region",
+                 [OffloadRegionOpInterface, AffineScope,
+                  RecursiveMemoryEffects,
+                  SingleBlockImplicitTerminator<"YieldOp">,
+                  IsolatedFromAbove, AttrSizedOperandSegments]> {
+  let summary = "Compute region for GPU execution";
+  let description = [{
+    The `acc.compute_region` operation wraps a region of code that will be
+    compiled and executed on a GPU. It is typically produced by lowering
+    OpenACC compute constructs (`acc.parallel`, `acc.kernels`, `acc.serial`)
+    but can also be targeted directly by other frontends or lowered from
+    other constructs that benefit from the automatic parallelization and data
+    mapping facilities that the `acc` dialect provides. It serves as the
+    bridge between the high-level representation and the `gpu.launch`
+    operation.
+
+    The operation is `IsolatedFromAbove`: all values used inside the
+    region must be explicitly captured. Values are captured in two ways:
+
+    - Launch arguments (`launch`): Results of `acc.par_width`
+      operations that define the parallel launch configuration. These
+      become `index`-typed block arguments representing the parallel
+      width for each dimension.
+
+    - Input arguments (`ins`): Arbitrary values captured from outside
+      the region (data pointers, scalars, etc.). These become block
+      arguments with their original types.
+
+    The `origin` attribute records which construct produced this compute
+    region (e.g., `"acc.parallel"`, `"acc.kernels"`). This is intended to
+    be solely informational.
+
+    Example:
+
+    ```mlir
+    %w0 = acc.par_width %c128 {par_dim = #acc.par_dim<thread_x>}
+    %w1 = acc.par_width %c8 {par_dim = #acc.par_dim<block_x>}
+    acc.compute_region launch(%arg0 = %w0, %arg1 = %w1)
+        ins(%arg2 = %data) : (memref<1024xf32>) {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c1024 = arith.constant 1024 : index
+      scf.parallel (%iv) = (%c0) to (%c1024) step (%c1) {
+        %v = memref.load %arg2[%iv] : memref<1024xf32>
+        scf.reduce
+      } {acc.par_dims = #acc<par_dims[thread_x]>}
+      acc.yield
+    } {origin = "acc.parallel"}
+    ```
+  }];
+
+  let arguments = (ins Variadic<Index>:$launchArgs,
+                       Variadic<AnyType>:$inputArgs,
+                       Optional<OpenACC_GPUAsyncTokenType>:$stream,
+                       StrAttr:$origin,
+                       OptionalAttr<FlatSymbolRefAttr>:$kernel_func_name,
+                       OptionalAttr<FlatSymbolRefAttr>:$kernel_module_name);
+
+  let results = (outs Variadic<AnyType>:$results);
+
+  let regions = (region AnyRegion:$region);
+
+  let extraClassDeclaration = [{
+    /// Look up the par_width result for the given dimension among launch args.
+    std::optional<mlir::Value> getLaunchArg(
+        ::mlir::acc::GPUParallelDimAttr parDim);
+
+    /// Get the known (non-empty) launch value for a dimension.
+    std::optional<mlir::Value> getKnownLaunchArg(
+        ::mlir::acc::GPUParallelDimAttr parDim);
+
+    /// Get the known constant launch value for a dimension.
+    std::optional<uint64_t> getKnownConstantLaunchArg(
+        ::mlir::acc::GPUParallelDimAttr parDim);
+
+    /// Add a new input argument, appending to both the operand list and
+    /// the region block arguments. Returns the new block argument.
+    ::mlir::BlockArgument appendInputArg(::mlir::Value);
+
+    /// Check whether all parallel dimensions have width 1.
+    bool isEffectivelySerial();
+
+    /// Get the block argument representing the width for a given dimension.
+    ::mlir::BlockArgument parDimToWidth(
+        ::mlir::acc::GPUParallelDimAttr parDim);
+
+    /// Get the block argument for a specific gpu::Processor.
+    ::mlir::BlockArgument gpuParWidth(::mlir::gpu::Processor);
+
+    /// Collect all GPU parallel dimensions present in the launch config.
+    llvm::SmallVector<::mlir::acc::GPUParallelDimAttr> getLaunchParDims();
+
+    /// Get the body block of the compute region.
+    ::mlir::Block *getBody() { return &getRegion().front(); }
+
+    /// Get the terminator of the compute region.
+    ::mlir::Operation *getTerminator() {
+      return &getRegion().back().back();
+    }
+
+    /// Map a block argument back to its corresponding operand
+    /// ($launchArgs or $inputArgs).
+    ::mlir::Value getOperand(::mlir::BlockArgument blockArg);
+  }];
+
+  let hasVerifier = 1;
+  let hasCustomAssemblyFormat = 1;
+}
+
 #endif // OPENACC_CG_OPS
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 2bb1654cb6369..33d3b84b32b98 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2845,7 +2845,7 @@ def OpenACC_LoopOp
 def OpenACC_YieldOp : OpenACC_Op<"yield", [Pure, ReturnLike, Terminator,
     ParentOneOf<["FirstprivateRecipeOp, LoopOp, ParallelOp, PrivateRecipeOp,"
                  "ReductionRecipeOp, ReductionInitOp, ReductionCombineRegionOp,"
-                 "SerialOp, AtomicUpdateOp"]>]> {
+                 "SerialOp, AtomicUpdateOp, ComputeRegionOp"]>]> {
   let summary = "Acc yield and termination operation";
 
   let description = [{
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
index 881234cdd6198..6b0e1c07670ac 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -16,6 +16,7 @@
 
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Region.h"
@@ -325,6 +326,248 @@ void ReductionCombineOp::getEffects(
                        SideEffects::DefaultResource::get());
 }
 
+//===----------------------------------------------------------------------===//
+// ComputeRegionOp
+//===----------------------------------------------------------------------===//
+
+static ParWidthOp getParWidthOpForLaunchArg(ComputeRegionOp op,
+                                            GPUParallelDimAttr parDim) {
+  for (auto launchArg : op.getLaunchArgs()) {
+    auto parOp = launchArg.getDefiningOp<ParWidthOp>();
+    if (!parOp)
+      continue;
+    auto launchArgDim = cast<GPUParallelDimAttr>(parOp.getParDim());
+    if (launchArgDim == parDim)
+      return parOp;
+  }
+  return nullptr;
+}
+
+std::optional<Value>
+ComputeRegionOp::getLaunchArg(GPUParallelDimAttr parDim) {
+  if (auto parWidthOp = getParWidthOpForLaunchArg(*this, parDim))
+    return parWidthOp.getResult();
+  return {};
+}
+
+std::optional<Value>
+ComputeRegionOp::getKnownLaunchArg(GPUParallelDimAttr parDim) {
+  if (auto parWidthOp = getParWidthOpForLaunchArg(*this, parDim))
+    if (parWidthOp.getLaunchArg())
+      return parWidthOp.getLaunchArg();
+  return {};
+}
+
+std::optional<uint64_t>
+ComputeRegionOp::getKnownConstantLaunchArg(GPUParallelDimAttr parDim) {
+  auto knownParWidth = getKnownLaunchArg(parDim);
+  if (knownParWidth.has_value())
+    return getConstantIntValue(knownParWidth.value());
+  return {};
+}
+
+BlockArgument ComputeRegionOp::appendInputArg(Value value) {
+  getInputArgsMutable().append(value);
+  return getBody()->addArgument(value.getType(), getLoc());
+}
+
+bool ComputeRegionOp::isEffectivelySerial() {
+  auto *ctx = getContext();
+
+  if (getLaunchArg(GPUParallelDimAttr::seqDim(ctx)))
+    return true;
+
+  auto checkDim = [&](GPUParallelDimAttr dim) -> bool {
+    auto val = getKnownConstantLaunchArg(dim);
+    return val && *val == 1;
+  };
+
+  return checkDim(GPUParallelDimAttr::threadXDim(ctx)) &&
+         checkDim(GPUParallelDimAttr::threadYDim(ctx)) &&
+         checkDim(GPUParallelDimAttr::threadZDim(ctx)) &&
+         checkDim(GPUParallelDimAttr::blockXDim(ctx)) &&
+         checkDim(GPUParallelDimAttr::blockYDim(ctx)) &&
+         checkDim(GPUParallelDimAttr::blockZDim(ctx));
+}
+
+BlockArgument ComputeRegionOp::parDimToWidth(GPUParallelDimAttr parDim) {
+  for (auto [pos, launchArg] : llvm::enumerate(getLaunchArgs())) {
+    auto parOp = launchArg.getDefiningOp<ParWidthOp>();
+    assert(parOp);
+    auto launchArgDim = cast<GPUParallelDimAttr>(parOp.getParDim());
+    if (launchArgDim == parDim) {
+      assert(pos < getRegion().front().getNumArguments() &&
+             "launch arg position out of range");
+      return getRegion().front().getArgument(pos);
+    }
+  }
+  llvm_unreachable("attempting to get unspecified parDim");
+}
+
+SmallVector<GPUParallelDimAttr> ComputeRegionOp::getLaunchParDims() {
+  SmallVector<GPUParallelDimAttr> parDims;
+  for (auto launchArg : getLaunchArgs()) {
+    auto parOp = launchArg.getDefiningOp<ParWidthOp>();
+    auto launchArgDim = cast<GPUParallelDimAttr>(parOp.getParDim());
+    int64_t dimInt = launchArgDim.getValue().getInt();
+    parDims.push_back(intToParDim(getContext(), dimInt));
+  }
+  return parDims;
+}
+
+Value ComputeRegionOp::getOperand(BlockArgument blockArg) {
+  unsigned argNumber = blockArg.getArgNumber();
+  unsigned numLaunchArgs = getLaunchArgs().size();
+  unsigned numInputArgs = getInputArgs().size();
+  assert(argNumber < (numLaunchArgs + numInputArgs) &&
+         "invalid block argument");
+  if (argNumber < numLaunchArgs)
+    return getLaunchArgs()[argNumber];
+  return getInputArgs()[argNumber - numLaunchArgs];
+}
+
+BlockArgument ComputeRegionOp::gpuParWidth(gpu::Processor processor) {
+  return parDimToWidth(GPUParallelDimAttr::get(getContext(), processor));
+}
+
+LogicalResult ComputeRegionOp::verify() {
+  for (auto op : getLaunchArgs())
+    if (!op.getDefiningOp<acc::ParWidthOp>())
+      return emitOpError(
+          "launch arguments must be results of acc.par_width operations");
+
+  unsigned expectedBlockArgs =
+      getLaunchArgs().size() + getInputArgs().size();
+  unsigned actualBlockArgs = getRegion().front().getNumArguments();
+  if (expectedBlockArgs != actualBlockArgs)
+    return emitOpError("expected ")
+           << expectedBlockArgs << " block arguments (launch + input), got "
+           << actualBlockArgs;
+
+  return success();
+}
+
+void ComputeRegionOp::print(OpAsmPrinter &p) {
+  ValueRange regionArgs = getBody()->getArguments();
+  ValueRange launchArgs = getLaunchArgs();
+  ValueRange inputArgs = getInputArgs();
+
+  assert(regionArgs.size() == (launchArgs.size() + inputArgs.size()) &&
+         "region args mismatch");
+
+  if (getStream())
+    p << " stream(" << getStream() << " : " << getStream().getType() << ")";
+
+  size_t i = 0;
+  if (!launchArgs.empty()) {
+    p << " launch(";
+    for (size_t j = 0; j < launchArgs.size(); ++j, ++i) {
+      p << regionArgs[i] << " = " << launchArgs[j];
+      if (j < launchArgs.size() - 1)
+        p << ", ";
+    }
+    p << ")";
+  }
+  if (!inputArgs.empty()) {
+    p << " ins(";
+    for (size_t j = 0; j < inputArgs.size(); ++j, ++i) {
+      p << regionArgs[i] << " = " << inputArgs[j];
+      if (j < inputArgs.size() - 1)
+        p << ", ";
+    }
+    p << ") : (";
+    for (size_t j = 0; j < inputArgs.size(); ++j) {
+      p << inputArgs[j].getType();
+      if (j < inputArgs.size() - 1)
+        p << ", ";
+    }
+    p << ")";
+  }
+  p.printOptionalArrowTypeList(getResultTypes());
+  p << " ";
+  p.printRegion(getRegion(), /*printEntryBlockArgs=*/false);
+  p.printOptionalAttrDict((*this)->getAttrs(),
+                          /*elidedAttrs=*/getOperandSegmentSizeAttr());
+}
+
+ParseResult ComputeRegionOp::parse(OpAsmParser &parser,
+                                   OperationState &result) {
+  auto &builder = parser.getBuilder();
+
+  SmallVector<OpAsmParser::Argument> regionArgs;
+  OpAsmParser::UnresolvedOperand streamOperand;
+  Type streamType;
+  SmallVector<OpAsmParser::UnresolvedOperand> launchOperands;
+  SmallVector<OpAsmParser::UnresolvedOperand> inputOperands;
+  SmallVector<Type> types;
+
+  bool hasStream = false;
+  if (succeeded(parser.parseOptionalKeyword("stream"))) {
+    hasStream = true;
+    if (parser.parseLParen() || parser.parseOperand(streamOperand) ||
+        parser.parseColon() || parser.parseType(streamType) ||
+        parser.parseRParen())
+      return failure();
+  }
+
+  if (succeeded(parser.parseOptionalKeyword("launch"))) {
+    if (parser.parseAssignmentList(regionArgs, launchOperands))
+      return failure();
+    Type indexType = builder.getIndexType();
+    for (size_t i = 0; i < regionArgs.size(); ++i)
+      types.push_back(indexType);
+  }
+
+  if (succeeded(parser.parseOptionalKeyword("ins"))) {
+    if (parser.parseAssignmentList(regionArgs, inputOperands) ||
+        parser.parseColon() || parser.parseLParen() ||
+        parser.parseTypeList(types) || parser.parseRParen())
+      return failure();
+  }
+
+  if (parser.parseOptionalArrowTypeList(result.types))
+    return failure();
+
+  for (auto [iterArg, type] : llvm::zip_equal(regionArgs, types))
+    iterArg.type = type;
+
+  Region *body = result.addRegion();
+  if (parser.parseRegion(*body, regionArgs))
+    return failure();
+
+  const size_t numLaunchOperands = launchOperands.size();
+  const size_t numInputOperands = inputOperands.size();
+  assert(numLaunchOperands + numInputOperands == regionArgs.size() &&
+         "compute region args mismatch");
+
+  result.addAttribute(ComputeRegionOp::getOperandSegmentSizeAttr(),
+                      builder.getDenseI32ArrayAttr(
+                          {static_cast<int32_t>(numLaunchOperands),
+                           static_cast<int32_t>(numInputOperands),
+                           hasStream ? 1 : 0}));
+
+  for (size_t i = 0; i < numLaunchOperands; ++i) {
+    if (parser.resolveOperand(launchOperands[i], types[i], result.operands))
+      return failure();
+  }
+
+  for (size_t i = numLaunchOperands; i < regionArgs.size(); ++i) {
+    if (parser.resolveOperand(inputOperands[i - numLaunchOperands], types[i],
+                              result.operands))
+      return failure();
+  }
+
+  if (hasStream) {
+    if (parser.resolveOperand(streamOperand, streamType, result.operands))
+      return failure();
+  }
+
+  if (parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // GPUParallelDimAttr
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/OpenACC/invalid-cg.mlir b/mlir/test/Dialect/OpenACC/invalid-cg.mlir
index bc2408ceafe8a..d218bc505a5ea 100644
--- a/mlir/test/Dialect/OpenACC/invalid-cg.mlir
+++ b/mlir/test/Dialect/OpenACC/invalid-cg.mlir
@@ -19,3 +19,22 @@ scf.parallel (%iv) = (%c0_2) to (%c4_2) step (%c1_2) {
   scf.reduce
 // expected-error@+1 {{expected one of ::mlir::gpu::Processor enum names}}
 } {acc.par_dims = #acc<par_dims[gang]>}
+
+// -----
+
+%c32 = arith.constant 32 : index
+// expected-error@+1 {{'acc.compute_region' op launch arguments must be results of acc.par_width operations}}
+acc.compute_region launch(%arg0 = %c32) {
+  acc.yield
+} {origin = "acc.parallel"}
+
+// -----
+
+// Use generic form to introduce an extra block argument.
+%c64 = arith.constant 64 : index
+%w = acc.par_width %c64 {par_dim = #acc.par_dim<thread_x>}
+// expected-error@+1 {{'acc.compute_region' op expected 1 block arguments (launch + input), got 2}}
+"acc.compute_region"(%w) <{operandSegmentSizes = array<i32: 1, 0, 0>}> ({
+^bb0(%arg0: index, %extra: index):
+  "acc.yield"() : () -> ()
+}) {origin = "acc.parallel"} : (index) -> ()
diff --git a/mlir/test/Dialect/OpenACC/ops-cg.mlir b/mlir/test/Dialect/OpenACC/ops-cg.mlir
index e6453da21ed79..7a61261d97ba9 100644
--- a/mlir/test/Dialect/OpenACC/ops-cg.mlir
+++ b/mlir/test/Dialect/OpenACC/ops-cg.mlir
@@ -77,3 +77,197 @@ func.func @par_dims_2d_grid() {
   return
 }
 // CHECK: acc.par_dims = #acc<par_dims[block_y, thread_y]>
+
+// -----
+
+// CHECK-LABEL: func @compute_region_single_dim
+func.func @compute_region_single_dim(%data: memref<1024xf32>,
+                                     %result: memref<f32>) {
+  %c128 = arith.constant 128 : index
+  %copyin = acc.copyin varPtr(%data : memref<1024xf32>) -> memref<1024xf32>
+  %copy = acc.copyin varPtr(%result : memref<f32>) -> memref<f32> {dataClause = #acc<data_clause acc_copy>}
+  acc.kernel_environment dataOperands(%copyin, %copy : memref<1024xf32>, memref<f32>) {
+    %w0 = acc.par_width %c128 {par_dim = #acc.par_dim<thread_x>}
+    acc.compute_region launch(%arg0 = %w0)
+        ins(%arg1 = %copyin, %arg2 = %copy) : (memref<1024xf32>, memref<f32>) {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c128_inner = arith.constant 128 : index
+      %cst = arith.constant 0.000000e+00 : f32
+      memref.store %cst, %arg2[] : memref<f32>
+      scf.parallel (%iv) = (%c0) to (%c128_inner) step (%c1) {
+        %val = memref.load %arg1[%iv] : memref<1024xf32>
+        %cur = memref.load %arg2[] : memref<f32>
+        %sum = arith.addf %cur, %val : f32
+        memref.store %sum, %arg2[] : memref<f32>
+        scf.reduce
+      } {acc.par_dims = #acc<par_dims[thread_x]>}
+      acc.yield
+    } {origin = "acc.parallel"}
+  }
+  acc.copyout accPtr(%copy : memref<f32>) to varPtr(%result : memref<f32>) {dataClause = #acc<data_clause acc_copy>}
+  acc.delete accPtr(%copyin : memref<1024xf32>)
+  return
+}
+// CHECK: %[[W:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<thread_x>}
+// CHECK: acc.compute_region launch(%{{.*}} = %[[W]]) ins({{.*}}) : (memref<1024xf32>, memref<f32>) {
+// CHECK:   acc.yield
+// CHECK: } {origin = "acc.parallel"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_two_dims
+func.func @compute_region_two_dims(%data: memref<8xi32>,
+                                   %reduction_var: memref<i32>) {
+  %c8 = arith.constant 8 : index
+  %c128 = arith.constant 128 : index
+  %copyin_data = acc.copyin varPtr(%data : memref<8xi32>) -> memref<8xi32>
+  %copyin_red = acc.copyin varPtr(%reduction_var : memref<i32>) -> memref<i32> {dataClause = #acc<data_clause acc_reduction>}
+  acc.kernel_environment dataOperands(%copyin_data, %copyin_red : memref<8xi32>, memref<i32>) {
+    %w0 = acc.par_width %c8 {par_dim = #acc.par_dim<block_x>}
+    %w1 = acc.par_width %c128 {par_dim = #acc.par_dim<thread_x>}
+    acc.compute_region launch(%arg0 = %w0, %arg1 = %w1)
+        ins(%arg2 = %copyin_data, %arg3 = %copyin_red) : (memref<8xi32>, memref<i32>) {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c8_inner = arith.constant 8 : index
+      %c0_i32 = arith.constant 0 : i32
+      %init = acc.reduction_init %arg3 <add> : memref<i32> {
+        %alloca = memref.alloca() : memref<i32>
+        memref.store %c0_i32, %alloca[] : memref<i32>
+        acc.yield %alloca : memref<i32>
+      }
+      scf.parallel (%iv) = (%c0) to (%c8_inner) step (%c1) {
+        %v = memref.load %arg2[%iv] : memref<8xi32>
+        %cur = memref.load %init[] : memref<i32>
+        %sum = arith.addi %cur, %v : i32
+        memref.store %sum, %init[] : memref<i32>
+        scf.reduce
+      } {acc.par_dims = #acc<par_dims[block_x, thread_x]>}
+      acc.reduction_combine %init into %arg3 <add> : memref<i32>
+      acc.yield
+    } {origin = "acc.parallel"}
+  }
+  acc.copyout accPtr(%copyin_red : memref<i32>) to varPtr(%reduction_var : memref<i32>) {dataClause = #acc<data_clause acc_reduction>}
+  acc.delete accPtr(%copyin_data : memref<8xi32>)
+  return
+}
+// CHECK: %[[W0:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<block_x>}
+// CHECK: %[[W1:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<thread_x>}
+// CHECK: acc.compute_region launch(%{{.*}} = %[[W0]], %{{.*}} = %[[W1]]) ins({{.*}}) : (memref<8xi32>, memref<i32>) {
+// CHECK:   acc.yield
+// CHECK: } {origin = "acc.parallel"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_unknown_width
+func.func @compute_region_unknown_width(%data: memref<100xf32>) {
+  %copyin = acc.copyin varPtr(%data : memref<100xf32>) -> memref<100xf32>
+  acc.kernel_environment dataOperands(%copyin : memref<100xf32>) {
+    %w0 = acc.par_width {par_dim = #acc.par_dim<thread_x>}
+    acc.compute_region launch(%arg0 = %w0)
+        ins(%arg1 = %copyin) : (memref<100xf32>) {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c100 = arith.constant 100 : index
+      scf.parallel (%iv) = (%c0) to (%c100) step (%c1) {
+        scf.reduce
+      } {acc.par_dims = #acc<par_dims[thread_x]>}
+      acc.yield
+    } {origin = "acc.kernels"}
+  }
+  acc.delete accPtr(%copyin : memref<100xf32>)
+  return
+}
+// CHECK: %[[W:.*]] = acc.par_width {par_dim = #acc.par_dim<thread_x>}
+// CHECK: acc.compute_region launch(%{{.*}} = %[[W]]) ins({{.*}}) : (memref<100xf32>) {
+// CHECK:   acc.yield
+// CHECK: } {origin = "acc.kernels"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_no_launch
+func.func @compute_region_no_launch(%a: memref<i32>, %b: memref<i32>) {
+  %copy_a = acc.copyin varPtr(%a : memref<i32>) -> memref<i32> {dataClause = #acc<data_clause acc_copy>}
+  %copy_b = acc.copyin varPtr(%b : memref<i32>) -> memref<i32> {dataClause = #acc<data_clause acc_copy>}
+  acc.kernel_environment dataOperands(%copy_a, %copy_b : memref<i32>, memref<i32>) {
+    acc.compute_region
+        ins(%arg0 = %copy_a, %arg1 = %copy_b) : (memref<i32>, memref<i32>) {
+      %c1 = arith.constant 1 : i32
+      memref.store %c1, %arg0[] : memref<i32>
+      memref.store %c1, %arg1[] : memref<i32>
+      acc.yield
+    } {origin = "acc.serial"}
+  }
+  acc.copyout accPtr(%copy_a : memref<i32>) to varPtr(%a : memref<i32>) {dataClause = #acc<data_clause acc_copy>}
+  acc.copyout accPtr(%copy_b : memref<i32>) to varPtr(%b : memref<i32>) {dataClause = #acc<data_clause acc_copy>}
+  return
+}
+// CHECK: acc.compute_region ins({{.*}}) : (memref<i32>, memref<i32>) {
+// CHECK:   acc.yield
+// CHECK: } {origin = "acc.serial"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_launch_only
+func.func @compute_region_launch_only() {
+  %c32 = arith.constant 32 : index
+  %w0 = acc.par_width %c32 {par_dim = #acc.par_dim<thread_x>}
+  acc.compute_region launch(%arg0 = %w0) {
+    acc.yield
+  } {origin = "acc.parallel"}
+  return
+}
+// CHECK: %[[W:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<thread_x>}
+// CHECK: acc.compute_region launch(%{{.*}} = %[[W]]) {
+// CHECK:   acc.yield
+// CHECK: } {origin = "acc.parallel"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_all_fields
+// CHECK-SAME: (%{{.*}}: memref<1024xf32>, %[[STREAM:.*]]: !gpu.async.token)
+func.func @compute_region_all_fields(%data: memref<1024xf32>,
+                                     %stream: !gpu.async.token) {
+  %c128 = arith.constant 128 : index
+  %c8 = arith.constant 8 : index
+  %copyin = acc.copyin varPtr(%data : memref<1024xf32>) -> memref<1024xf32>
+  acc.kernel_environment dataOperands(%copyin : memref<1024xf32>) {
+    %w0 = acc.par_width %c8 {par_dim = #acc.par_dim<block_x>}
+    %w1 = acc.par_width %c128 {par_dim = #acc.par_dim<thread_x>}
+    acc.compute_region stream(%stream : !gpu.async.token)
+        launch(%arg0 = %w0, %arg1 = %w1)
+        ins(%arg2 = %copyin) : (memref<1024xf32>) {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c1024 = arith.constant 1024 : index
+      scf.parallel (%iv) = (%c0) to (%c1024) step (%c1) {
+        scf.reduce
+      } {acc.par_dims = #acc<par_dims[block_x, thread_x]>}
+      acc.yield
+    } {kernel_func_name = @compute_kernel, kernel_module_name = @device_module, origin = "acc.parallel"}
+  }
+  acc.delete accPtr(%copyin : memref<1024xf32>)
+  return
+}
+// CHECK: %[[W0:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<block_x>}
+// CHECK: %[[W1:.*]] = acc.par_width %{{.*}} {par_dim = #acc.par_dim<thread_x>}
+// CHECK: acc.compute_region stream(%[[STREAM]] : !gpu.async.token) launch(%{{.*}} = %[[W0]], %{{.*}} = %[[W1]]) ins({{.*}}) : (memref<1024xf32>) {
+// CHECK:   acc.yield
+// CHECK: } {kernel_func_name = @compute_kernel, kernel_module_name = @device_module, origin = "acc.parallel"}
+
+// -----
+
+// CHECK-LABEL: func @compute_region_with_results
+func.func @compute_region_with_results() -> i32 {
+  %w0 = acc.par_width {par_dim = #acc.par_dim<thread_x>}
+  %0 = acc.compute_region launch(%arg0 = %w0) -> i32 {
+    %c0_i32 = arith.constant 0 : i32
+    acc.yield %c0_i32 : i32
+  } {origin = "acc.parallel"}
+  return %0 : i32
+}
+// CHECK: %[[W:.*]] = acc.par_width {par_dim = #acc.par_dim<thread_x>}
+// CHECK: {{.*}} = acc.compute_region launch(%{{.*}} = %[[W]]) -> i32 {
+// CHECK:   acc.yield
+// CHECK: } {origin = "acc.parallel"}

>From ebb29dbeafcc62a437f8775599173ad0647503d4 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Thu, 5 Mar 2026 11:28:37 -0800
Subject: [PATCH 2/3] Fix format

---
 mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
index 6b0e1c07670ac..5de3e29800066 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -343,8 +343,7 @@ static ParWidthOp getParWidthOpForLaunchArg(ComputeRegionOp op,
   return nullptr;
 }
 
-std::optional<Value>
-ComputeRegionOp::getLaunchArg(GPUParallelDimAttr parDim) {
+std::optional<Value> ComputeRegionOp::getLaunchArg(GPUParallelDimAttr parDim) {
   if (auto parWidthOp = getParWidthOpForLaunchArg(*this, parDim))
     return parWidthOp.getResult();
   return {};
@@ -436,8 +435,7 @@ LogicalResult ComputeRegionOp::verify() {
       return emitOpError(
           "launch arguments must be results of acc.par_width operations");
 
-  unsigned expectedBlockArgs =
-      getLaunchArgs().size() + getInputArgs().size();
+  unsigned expectedBlockArgs = getLaunchArgs().size() + getInputArgs().size();
   unsigned actualBlockArgs = getRegion().front().getNumArguments();
   if (expectedBlockArgs != actualBlockArgs)
     return emitOpError("expected ")
@@ -540,11 +538,11 @@ ParseResult ComputeRegionOp::parse(OpAsmParser &parser,
   assert(numLaunchOperands + numInputOperands == regionArgs.size() &&
          "compute region args mismatch");
 
-  result.addAttribute(ComputeRegionOp::getOperandSegmentSizeAttr(),
-                      builder.getDenseI32ArrayAttr(
-                          {static_cast<int32_t>(numLaunchOperands),
-                           static_cast<int32_t>(numInputOperands),
-                           hasStream ? 1 : 0}));
+  result.addAttribute(
+      ComputeRegionOp::getOperandSegmentSizeAttr(),
+      builder.getDenseI32ArrayAttr({static_cast<int32_t>(numLaunchOperands),
+                                    static_cast<int32_t>(numInputOperands),
+                                    hasStream ? 1 : 0}));
 
   for (size_t i = 0; i < numLaunchOperands; ++i) {
     if (parser.resolveOperand(launchOperands[i], types[i], result.operands))

>From 68738721dc0647d275efd882f828bbd4985bb6de Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Thu, 5 Mar 2026 12:57:21 -0800
Subject: [PATCH 3/3] Add ParWidth type to capture par dimension info

---
 mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td    | 12 ++++++------
 mlir/include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td |  8 ++++++++
 mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp            |  9 ++-------
 mlir/test/Dialect/OpenACC/invalid-cg.mlir            |  5 +++--
 4 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
index be2a0ff3ec028..ebb0e6132fee3 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
@@ -238,7 +238,7 @@ def OpenACC_ParWidthOp
   }];
   let arguments = (ins Optional<Index>:$launchArg,
                        OpenACC_GPUParallelDimAttr:$par_dim);
-  let results = (outs Index:$output);
+  let results = (outs OpenACC_ParWidthType:$output);
   let assemblyFormat = [{
     ($launchArg^)? attr-dict
   }];
@@ -273,10 +273,10 @@ def OpenACC_ComputeRegionOp
     The operation is `IsolatedFromAbove`: all values used inside the
     region must be explicitly captured. Values are captured in two ways:
 
-    - Launch arguments (`launch`): Results of `acc.par_width`
-      operations that define the parallel launch configuration. These
-      become `index`-typed block arguments representing the parallel
-      width for each dimension.
+    - Launch arguments (`launch`): Results of operations that define
+      the parallel launch configuration. These are `!acc.par_width`-typed
+      and become block arguments representing the parallel width for each
+      dimension.
 
     - Input arguments (`ins`): Arbitrary values captured from outside
       the region (data pointers, scalars, etc.). These become block
@@ -305,7 +305,7 @@ def OpenACC_ComputeRegionOp
     ```
   }];
 
-  let arguments = (ins Variadic<Index>:$launchArgs,
+  let arguments = (ins Variadic<OpenACC_ParWidthType>:$launchArgs,
                        Variadic<AnyType>:$inputArgs,
                        Optional<OpenACC_GPUAsyncTokenType>:$stream,
                        StrAttr:$origin,
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td
index 117272693d626..bba385e69c0f2 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsTypes.td
@@ -33,4 +33,12 @@ def OpenACC_DeclareTokenType : OpenACC_Type<"DeclareToken", "declare_token"> {
   }];
 }
 
+def OpenACC_ParWidthType : OpenACC_Type<"ParWidth", "par_width"> {
+  let summary = "parallel width token type";
+  let description = [{
+    Represents a type that is consumed by a compute region in order to
+    capture its parallelism dimension arguments.
+  }];
+}
+
 #endif // OPENACC_OPS_TYPES
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
index 5de3e29800066..85dba70dbde1d 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -430,11 +430,6 @@ BlockArgument ComputeRegionOp::gpuParWidth(gpu::Processor processor) {
 }
 
 LogicalResult ComputeRegionOp::verify() {
-  for (auto op : getLaunchArgs())
-    if (!op.getDefiningOp<acc::ParWidthOp>())
-      return emitOpError(
-          "launch arguments must be results of acc.par_width operations");
-
   unsigned expectedBlockArgs = getLaunchArgs().size() + getInputArgs().size();
   unsigned actualBlockArgs = getRegion().front().getNumArguments();
   if (expectedBlockArgs != actualBlockArgs)
@@ -511,9 +506,9 @@ ParseResult ComputeRegionOp::parse(OpAsmParser &parser,
   if (succeeded(parser.parseOptionalKeyword("launch"))) {
     if (parser.parseAssignmentList(regionArgs, launchOperands))
       return failure();
-    Type indexType = builder.getIndexType();
+    auto parWidthType = acc::ParWidthType::get(builder.getContext());
     for (size_t i = 0; i < regionArgs.size(); ++i)
-      types.push_back(indexType);
+      types.push_back(parWidthType);
   }
 
   if (succeeded(parser.parseOptionalKeyword("ins"))) {
diff --git a/mlir/test/Dialect/OpenACC/invalid-cg.mlir b/mlir/test/Dialect/OpenACC/invalid-cg.mlir
index d218bc505a5ea..f788e6c03bcc9 100644
--- a/mlir/test/Dialect/OpenACC/invalid-cg.mlir
+++ b/mlir/test/Dialect/OpenACC/invalid-cg.mlir
@@ -22,8 +22,9 @@ scf.parallel (%iv) = (%c0_2) to (%c4_2) step (%c1_2) {
 
 // -----
 
+// expected-note@+1 {{prior use here}}
 %c32 = arith.constant 32 : index
-// expected-error@+1 {{'acc.compute_region' op launch arguments must be results of acc.par_width operations}}
+// expected-error@+1 {{use of value '%c32' expects different type than prior uses: '!acc.par_width' vs 'index'}}
 acc.compute_region launch(%arg0 = %c32) {
   acc.yield
 } {origin = "acc.parallel"}
@@ -37,4 +38,4 @@ acc.compute_region launch(%arg0 = %c32) {
 "acc.compute_region"(%w) <{operandSegmentSizes = array<i32: 1, 0, 0>}> ({
 ^bb0(%arg0: index, %extra: index):
   "acc.yield"() : () -> ()
-}) {origin = "acc.parallel"} : (index) -> ()
+}) {origin = "acc.parallel"} : (!acc.par_width) -> ()