[Mlir-commits] [mlir] [mlir][acc] Add ACCComputeLowering pass (PR #185501)

Tue Mar 10 10:46:55 PDT 2026

https://github.com/razvanlupusoru updated https://github.com/llvm/llvm-project/pull/185501

>From 7f575dfc69243f3543d3075caa0a66453cf9d8bf Mon Sep 17 00:00:00 2001
From: Scott Manley <rscottmanley at gmail.com>
Date: Mon, 9 Mar 2026 12:41:25 -0700
Subject: [PATCH 1/3] [mlir][acc] Add ACCComputeLowering pass

Introduce a pass that lowers OpenACC compute constructs to a
representation that separates the data environment from the
compute body and prepares for parallelism assignment and
privatization at the right granularity.

- Decompose acc.parallel, acc.serial, and acc.kernels into
  acc.kernel_environment and acc.compute_region. Launch arguments
  (num_gangs, num_workers, vector_length) are turned into
  acc.par_width and passed as compute_region launch operands.
- Convert acc.loop to SCF based on context: unstructured loops to
  scf.execute_region; sequential (serial or seq) to scf.parallel
  with par_dims=sequential; auto loops to scf.for (with collapse
  when multi-dimensional); orphan loops to scf.for; independent
  loops in parallel/kernels to scf.parallel with par_dims from the
  GPU mapping.
---
 .../mlir/Dialect/OpenACC/OpenACCCGOps.td      |  11 +
 .../mlir/Dialect/OpenACC/OpenACCUtils.h       |   5 +-
 .../mlir/Dialect/OpenACC/OpenACCUtilsCG.h     |  24 +-
 .../mlir/Dialect/OpenACC/OpenACCUtilsLoop.h   |  16 +-
 .../mlir/Dialect/OpenACC/Transforms/Passes.td |  71 ++--
 mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp     |  28 +-
 .../OpenACC/Transforms/ACCComputeLowering.cpp | 361 ++++++++++++++++++
 .../Dialect/OpenACC/Transforms/CMakeLists.txt |   3 +
 .../Dialect/OpenACC/Utils/OpenACCUtils.cpp    |   3 +-
 .../Dialect/OpenACC/Utils/OpenACCUtilsCG.cpp  |  53 +++
 .../OpenACC/acc-compute-lowering-compute.mlir | 107 ++++++
 .../OpenACC/acc-compute-lowering-loop.mlir    | 130 +++++++
 .../acc-compute-lowering-unstructured.mlir    |  34 ++
 .../Dialect/OpenACC/OpenACCUtilsCGTest.cpp    |  77 +++-
 14 files changed, 879 insertions(+), 44 deletions(-)
 create mode 100644 mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
 create mode 100644 mlir/test/Dialect/OpenACC/acc-compute-lowering-compute.mlir
 create mode 100644 mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
 create mode 100644 mlir/test/Dialect/OpenACC/acc-compute-lowering-unstructured.mlir

diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
index ebb0e6132fee3..f6ae871eb9936 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCCGOps.td
@@ -181,6 +181,17 @@ def OpenACC_KernelEnvironmentOp
   }];
 
   let hasCanonicalizer = 1;
+
+  let extraClassDeclaration = [{
+    /// Create a `KernelEnvironmentOp` populated with data mapping, async, and
+    /// wait clauses extracted from the given ACC compute construct. Emplaces
+    /// a block in the region and sets the builder's insertion point to the
+    /// start of that block so callers can create operations inside it
+    /// (e.g., `acc.compute_region`).
+    template <typename ComputeConstructT>
+    static KernelEnvironmentOp createAndPopulate(
+        ComputeConstructT computeConstruct, ::mlir::OpBuilder &builder);
+  }];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
index aac3bf7ed67c8..dd3d34b8252d3 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtils.h
@@ -10,10 +10,13 @@
 #define MLIR_DIALECT_OPENACC_OPENACCUTILS_H_
 
 #include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/Remarks.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include <functional>
+#include "llvm/ADT/Twine.h"
+#include <optional>
+#include <string>
 
 namespace mlir {
 class DominanceInfo;
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
index 7bead720b1077..b5fdcca761a4a 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
@@ -14,12 +14,12 @@
 #ifndef MLIR_DIALECT_OPENACC_OPENACCUTILSCG_H_
 #define MLIR_DIALECT_OPENACC_OPENACCUTILSCG_H_
 
+#include "mlir/Dialect/OpenACC/OpenACC.h"
 #include "mlir/Interfaces/DataLayoutInterfaces.h"
+#include "mlir/IR/IRMapping.h"
 #include <optional>
 
 namespace mlir {
-class Operation;
-
 namespace acc {
 
 /// Get the data layout for an operation.
@@ -34,6 +34,26 @@ namespace acc {
 std::optional<DataLayout> getDataLayout(Operation *op,
                                         bool allowDefault = true);
 
+/// Build an `acc.compute_region` operation by cloning a source region.
+///
+/// Creates a new `acc.compute_region` with the given launch arguments and
+/// origin string, then clones the operations from `regionToClone` into its
+/// body. Multi-block regions are wrapped with `scf.execute_region`.
+///
+/// The `mapping` is used and updated during cloning, allowing callers to
+/// track value correspondences. Optional `output`, `kernelFuncName`,
+/// `kernelModuleName`, and `stream` arguments are forwarded to the op.
+ComputeRegionOp buildComputeRegion(Location loc, ValueRange launchArgs,
+                                   ValueRange inputArgs,
+                                   llvm::StringRef origin,
+                                   Region &regionToClone,
+                                   RewriterBase &rewriter,
+                                   IRMapping &mapping,
+                                   ValueRange output = {},
+                                   FlatSymbolRefAttr kernelFuncName = {},
+                                   FlatSymbolRefAttr kernelModuleName = {},
+                                   Value stream = {});
+
 } // namespace acc
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h
index 67ae2d1ede04e..e0428bec620ca 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsLoop.h
@@ -13,22 +13,12 @@
 #ifndef MLIR_DIALECT_OPENACC_OPENACCUTILSLOOP_H_
 #define MLIR_DIALECT_OPENACC_OPENACCUTILSLOOP_H_
 
-#include "mlir/IR/Block.h"
-#include "mlir/IR/ValueRange.h"
-#include "llvm/ADT/SmallVector.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/IRMapping.h"
 
 namespace mlir {
-class IRMapping;
-class Location;
-class Region;
-class RewriterBase;
-namespace scf {
-class ForOp;
-class ParallelOp;
-class ExecuteRegionOp;
-} // namespace scf
 namespace acc {
-class LoopOp;
 
 /// Clone an ACC region into a destination block at the given insertion point.
 /// Requires a single-block source region. Maps block arguments and optional
diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
index 8e00846255254..9ab99208f83c7 100644
--- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
@@ -11,6 +11,26 @@
 
 include "mlir/Pass/PassBase.td"
 
+//===----------------------------------------------------------------------===//
+// Common options shared by multiple ACC passes
+//===----------------------------------------------------------------------===//
+
+def AccDeviceTypeOption : Option<"deviceType", "device-type",
+    "mlir::acc::DeviceType", "mlir::acc::DeviceType::None",
+    "Target device type. One use case is ensuring that device_type-specific "
+    "clauses are considered. Another is device-specific specializations.",
+    [{::llvm::cl::values(
+       clEnumValN(mlir::acc::DeviceType::None, "none", "none"),
+       clEnumValN(mlir::acc::DeviceType::Host, "host", "host"),
+       clEnumValN(mlir::acc::DeviceType::Multicore, "multicore", "multicore"),
+       clEnumValN(mlir::acc::DeviceType::Nvidia, "nvidia", "nvidia"),
+       clEnumValN(mlir::acc::DeviceType::Radeon, "radeon", "radeon"))
+    }]>;
+
+//===----------------------------------------------------------------------===//
+// Pass definitions
+//===----------------------------------------------------------------------===//
+
 def LegalizeDataValuesInRegion : Pass<"openacc-legalize-data-values", "mlir::func::FuncOp"> {
   let summary = "Legalizes SSA values in compute regions with results from data clause operations";
   let description = [{
@@ -120,20 +140,7 @@ def ACCImplicitRoutine : Pass<"acc-implicit-routine", "mlir::ModuleOp"> {
        while avoiding infinite recursion through proper tracking.
   }];
   let dependentDialects = ["mlir::acc::OpenACCDialect"];
-  let options = [
-    Option<"deviceType", "device-type", "mlir::acc::DeviceType",
-           "mlir::acc::DeviceType::None",
-           "Target device type for implicit routine generation. "
-           "Ensures that `acc routine` device_type clauses are "
-           "properly considered not just default clauses.",
-           [{::llvm::cl::values(
-              clEnumValN(mlir::acc::DeviceType::None, "none", "none"),
-              clEnumValN(mlir::acc::DeviceType::Host, "host", "host"),
-              clEnumValN(mlir::acc::DeviceType::Multicore, "multicore", "multicore"),
-              clEnumValN(mlir::acc::DeviceType::Nvidia, "nvidia", "nvidia"),
-              clEnumValN(mlir::acc::DeviceType::Radeon, "radeon", "radeon"))
-           }]>
-  ];
+  let options = [ AccDeviceTypeOption ];
 }
 
 def ACCDeclareGPUModuleInsertion : Pass<"acc-declare-gpu-module-insertion", "mlir::ModuleOp"> {
@@ -412,21 +419,35 @@ def OffloadTargetVerifier : Pass<"offload-target-verifier", "mlir::func::FuncOp"
   }];
   let dependentDialects = ["mlir::acc::OpenACCDialect"];
   let options = [
-    Option<"deviceType", "device-type", "mlir::acc::DeviceType",
-           "mlir::acc::DeviceType::None",
-           "Target device type for verification. Host/multicore uses host "
-           "region checking, all others use device region checking.",
-           [{::llvm::cl::values(
-              clEnumValN(mlir::acc::DeviceType::None, "none", "none"),
-              clEnumValN(mlir::acc::DeviceType::Host, "host", "host"),
-              clEnumValN(mlir::acc::DeviceType::Multicore, "multicore", "multicore"),
-              clEnumValN(mlir::acc::DeviceType::Nvidia, "nvidia", "nvidia"),
-              clEnumValN(mlir::acc::DeviceType::Radeon, "radeon", "radeon"))
-           }]>,
+    AccDeviceTypeOption,
     Option<"softCheck", "soft-check", "bool", "false",
            "When true, illegal values are printed via LLVM_DEBUG instead of "
            "failing compilation. Useful for diagnostic purposes.">
   ];
 }
 
+def ACCComputeLowering : Pass<"acc-compute-lowering", "mlir::func::FuncOp"> {
+  let summary = "Lower ACC compute constructs to acc.compute_region";
+  let description = [{
+    Converts ACC frontend compute constructs (`acc.parallel`, `acc.kernels`,
+    `acc.serial`) to `acc.compute_region` wrapped in `acc.kernel_environment`.
+    Converts `acc.loop` to SCF parallel/for loops with parallel dimension
+    annotations.
+
+    The pass applies two phases of pattern rewrites:
+    1. Loop conversion: `acc.loop` is converted to `scf.parallel` or `scf.for`
+       while the parent compute construct is still present (needed to determine
+       loop conversion strategy).
+    2. Compute construct conversion: `acc.parallel`, `acc.kernels`, and
+       `acc.serial` are replaced by `acc.kernel_environment` containing
+       `acc.compute_region`.
+  }];
+  let dependentDialects = [
+    "mlir::acc::OpenACCDialect",
+    "mlir::arith::ArithDialect",
+    "mlir::scf::SCFDialect"
+  ];
+  let options = [ AccDeviceTypeOption ];
+}
+
 #endif // MLIR_DIALECT_OPENACC_TRANSFORMS_PASSES
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
index e77a955ef3045..ba677082ba4e2 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -218,6 +218,31 @@ void KernelEnvironmentOp::getCanonicalizationPatterns(
   results.add<RemoveEmptyKernelEnvironment>(context);
 }
 
+template <typename ComputeConstructT>
+KernelEnvironmentOp KernelEnvironmentOp::createAndPopulate(
+    ComputeConstructT computeConstruct, OpBuilder &builder) {
+  auto kernelEnvironment = KernelEnvironmentOp::create(
+      builder, computeConstruct->getLoc(),
+      computeConstruct.getDataClauseOperands(),
+      computeConstruct.getAsyncOperands(),
+      computeConstruct.getAsyncOperandsDeviceTypeAttr(),
+      computeConstruct.getAsyncOnlyAttr(), computeConstruct.getWaitOperands(),
+      computeConstruct.getWaitOperandsSegmentsAttr(),
+      computeConstruct.getWaitOperandsDeviceTypeAttr(),
+      computeConstruct.getHasWaitDevnumAttr(),
+      computeConstruct.getWaitOnlyAttr());
+  Block &block = kernelEnvironment.getRegion().emplaceBlock();
+  builder.setInsertionPointToStart(&block);
+  return kernelEnvironment;
+}
+
+template KernelEnvironmentOp
+KernelEnvironmentOp::createAndPopulate<ParallelOp>(ParallelOp, OpBuilder &);
+template KernelEnvironmentOp
+KernelEnvironmentOp::createAndPopulate<KernelsOp>(KernelsOp, OpBuilder &);
+template KernelEnvironmentOp
+KernelEnvironmentOp::createAndPopulate<SerialOp>(SerialOp, OpBuilder &);
+
 //===----------------------------------------------------------------------===//
 // FirstprivateMapInitialOp
 //===----------------------------------------------------------------------===//
@@ -417,7 +442,8 @@ SmallVector<GPUParallelDimAttr> ComputeRegionOp::getLaunchParDims() {
 Value ComputeRegionOp::getOperand(BlockArgument blockArg) {
   unsigned argNumber = blockArg.getArgNumber();
   unsigned numLaunchArgs = getLaunchArgs().size();
-  assert(argNumber < (numLaunchArgs + getInputArgs().size()) &&
+  unsigned numInputArgs = getInputArgs().size();
+  assert(argNumber < (numLaunchArgs + numInputArgs) &&
          "invalid block argument");
   if (argNumber < numLaunchArgs)
     return getLaunchArgs()[argNumber];
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp b/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
new file mode 100644
index 0000000000000..8bb5dc9eb43d7
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
@@ -0,0 +1,361 @@
+//===- ACCComputeLowering.cpp - Lower ACC compute to compute_region -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass decomposes OpenACC compute constructs into a representation that
+// separates the data environment from the compute portion and prepares for
+// parallelism assignment and privatization at the appropriate level.
+//
+// Overview:
+// ---------
+// Each compute construct (`acc.parallel`, `acc.serial`, `acc.kernels`) is
+// lowered to (1) `acc.kernel_environment`, which captures the data environment
+// and (2) `acc.compute_region`, which holds the compute body. Inside the
+// compute region, acc.loop is converted to SCF loops (`scf.parallel` or
+// `scf.for`) with any predetermined parallelism expressed as `par_dims`. This
+// decomposition allows later phases to assign parallelism and handle
+// privatization at the right granularity.
+//
+// Transformations:
+// ----------------
+// 1. Compute constructs: acc.parallel, acc.serial, and acc.kernels are
+//    replaced by acc.kernel_environment containing a single acc.compute_region.
+//    Launch arguments (num_gangs, num_workers, vector_length) become
+//    acc.par_width ops and are passed as compute_region launch operands.
+//
+// 2. acc.loop: Converted according to context and attributes:
+//    - Unstructured: body wrapped in scf.execute_region.
+//    - Sequential (serial region or seq clause): scf.parallel with
+//      par_dims = sequential.
+//    - Auto (in parallel/kernels): scf.for with collapse when
+//      multi-dimensional.
+//    - Orphan (not inside a compute construct): scf.for, no collapse.
+//    - Independent (in parallel/kernels): scf.parallel with par_dims from
+//      gang/worker/vector mapping (e.g. block_x).
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/OpenACC/OpenACCParMapping.h"
+#include "mlir/Dialect/OpenACC/OpenACCUtils.h"
+#include "mlir/Dialect/OpenACC/OpenACCUtilsCG.h"
+#include "mlir/Dialect/OpenACC/OpenACCUtilsLoop.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+namespace mlir {
+namespace acc {
+#define GEN_PASS_DEF_ACCCOMPUTELOWERING
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
+} // namespace acc
+} // namespace mlir
+
+#define DEBUG_TYPE "acc-compute-lowering"
+
+using namespace mlir;
+using namespace mlir::acc;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+
+/// Strip index_cast operations from a value before checking for a constant.
+static Value stripIndexCasts(Value val) {
+  while (auto castOp = val.getDefiningOp<arith::IndexCastOp>())
+    val = castOp.getIn();
+  return val;
+}
+
+/// A parallel construct is "effectively serial" when it specifies
+/// num_gangs(1), num_workers(1), and vector_length(1). This matches
+/// the semantics of acc.serial but expressed through acc.parallel.
+static bool isEffectivelySerial(ParallelOp op) {
+  auto numGangs = op.getNumGangsValues();
+  if (numGangs.size() != 1)
+    return false;
+  Value numWorkers = op.getNumWorkersValue();
+  if (!numWorkers)
+    return false;
+  Value vectorLength = op.getVectorLengthValue();
+  if (!vectorLength)
+    return false;
+  return isConstantIntValue(stripIndexCasts(numGangs.front()), 1) &&
+         isConstantIntValue(stripIndexCasts(numWorkers), 1) &&
+         isConstantIntValue(stripIndexCasts(vectorLength), 1);
+}
+
+static bool isOpInComputeRegion(Operation *op) {
+  Region *region = op->getBlock()->getParent();
+  return getEnclosingComputeOp(*region) != nullptr;
+}
+
+static bool isOpInSerialRegion(Operation *op) {
+  if (auto parallelOp = op->getParentOfType<ParallelOp>())
+    return isEffectivelySerial(parallelOp);
+  if (auto computeRegion = op->getParentOfType<ComputeRegionOp>())
+    return computeRegion.isEffectivelySerial();
+  if (op->getParentOfType<SerialOp>())
+    return true;
+  return false;
+}
+
+static void setParDimsAttr(Operation *op, GPUParallelDimsAttr attr) {
+  op->setAttr(GPUParallelDimsAttr::name, attr);
+}
+
+/// Insert a parallel dimension, keeping the list sorted by descending
+/// GPUParallelDimAttr::getOrder and skipping a duplicate at that position.
+static void insertParDim(SmallVectorImpl<GPUParallelDimAttr> &parDims,
+                         GPUParallelDimAttr parDim) {
+  GPUParallelDimAttr *lb = llvm::lower_bound(
+      parDims, parDim,
+      [](const GPUParallelDimAttr &a, const GPUParallelDimAttr &b) {
+        return a.getOrder() > b.getOrder();
+      });
+  if (lb == parDims.end() || *lb != parDim)
+    parDims.insert(lb, parDim);
+}
+
+/// Map loop parallelism clauses (gang/worker/vector) to GPU parallel
+/// dimensions using the given mapping policy. The result is ordered and
+/// de-duplicated by insertParDim and is empty when no clause applies.
+static SmallVector<GPUParallelDimAttr>
+getParallelDimensions(LoopOp loopOp, const ACCToGPUMappingPolicy &policy,
+                      DeviceType deviceType) {
+  SmallVector<GPUParallelDimAttr> parDims;
+  auto *ctx = loopOp->getContext();
+
+  if (loopOp.hasVector(deviceType))
+    insertParDim(parDims, policy.vectorDim(ctx));
+  if (loopOp.hasWorker(deviceType))
+    insertParDim(parDims, policy.workerDim(ctx));
+  if (auto gangDimValue = loopOp.getGangValue(GangArgType::Dim, deviceType)) {
+    // Accept any constant form (index-typed or behind index_cast), matching
+    // isEffectivelySerial; a non-constant dim contributes no gang dimension.
+    if (auto gangDim = getConstantIntValue(stripIndexCasts(gangDimValue)))
+      insertParDim(parDims, policy.gangDim(ctx, getGangParLevel(*gangDim)));
+  } else if (loopOp.hasGang(deviceType)) {
+    insertParDim(parDims, policy.gangDim(ctx, ParLevel::gang_dim1));
+  }
+  return parDims;
+}
+
+/// Create acc.par_width operations from gang/worker/vector values of a
+/// compute construct. Queries the device-type-specific values first, falling
+/// back to the default (DeviceType::None) values.
+template <typename ComputeConstructT>
+static SmallVector<Value>
+assignKnownLaunchArgs(ComputeConstructT computeOp, DeviceType deviceType,
+                      RewriterBase &rewriter,
+                      const ACCToGPUMappingPolicy &policy) {
+  SmallVector<Value> values;
+  auto *ctx = rewriter.getContext();
+  auto indexTy = rewriter.getIndexType();
+  auto loc = computeOp->getLoc();
+
+  auto numGangs = computeOp.getNumGangsValues(deviceType);
+  if (numGangs.empty())
+    numGangs = computeOp.getNumGangsValues();
+  for (auto [gangDimIdx, gangSize] : llvm::enumerate(numGangs)) {
+    auto gangLevel = getGangParLevel(gangDimIdx + 1);
+    values.push_back(
+        ParWidthOp::create(rewriter, loc,
+                           getValueOrCreateCastToIndexLike(
+                               rewriter, gangSize.getLoc(), indexTy, gangSize),
+                           policy.gangDim(ctx, gangLevel)));
+  }
+
+  Value numWorkers = computeOp.getNumWorkersValue(deviceType);
+  if (!numWorkers)
+    numWorkers = computeOp.getNumWorkersValue();
+  if (numWorkers) {
+    values.push_back(ParWidthOp::create(
+        rewriter, loc,
+        getValueOrCreateCastToIndexLike(rewriter, numWorkers.getLoc(), indexTy,
+                                        numWorkers),
+        policy.workerDim(ctx)));
+  }
+
+  Value vectorLength = computeOp.getVectorLengthValue(deviceType);
+  if (!vectorLength)
+    vectorLength = computeOp.getVectorLengthValue();
+  if (vectorLength) {
+    values.push_back(ParWidthOp::create(
+        rewriter, loc,
+        getValueOrCreateCastToIndexLike(rewriter, vectorLength.getLoc(),
+                                        indexTy, vectorLength),
+        policy.vectorDim(ctx)));
+  }
+  return values;
+}
+
+/// SerialOp has no gang/worker/vector clauses.
+template <>
+SmallVector<Value>
+assignKnownLaunchArgs<SerialOp>(SerialOp, DeviceType, RewriterBase &,
+                                const ACCToGPUMappingPolicy &) {
+  return {};
+}
+
+//===----------------------------------------------------------------------===//
+// Loop conversion pattern
+//===----------------------------------------------------------------------===//
+
+class ACCLoopConversion : public OpRewritePattern<LoopOp> {
+public:
+  ACCLoopConversion(MLIRContext *ctx, const ACCToGPUMappingPolicy &policy,
+                    DeviceType deviceType)
+      : OpRewritePattern<LoopOp>(ctx), policy(policy), deviceType(deviceType) {}
+
+  LogicalResult matchAndRewrite(LoopOp loopOp,
+                                PatternRewriter &rewriter) const override {
+    if (loopOp.getUnstructured()) {
+      auto executeRegion =
+          convertUnstructuredACCLoopToSCFExecuteRegion(loopOp, rewriter);
+      if (!executeRegion)
+        return failure();
+      rewriter.replaceOp(loopOp, executeRegion);
+      return success();
+    }
+
+    LoopParMode parMode = loopOp.getDefaultOrDeviceTypeParallelism(deviceType);
+
+    if (parMode == LoopParMode::loop_seq || isOpInSerialRegion(loopOp)) {
+      // Although it might seem unintuitive, scf.parallel is used here because
+      // the parallelism of the loop is already predetermined (as sequential).
+      // scf.for will become a candidate for auto-parallelization analysis.
+      auto parallelOp = convertACCLoopToSCFParallel(loopOp, rewriter);
+      if (!parallelOp)
+        return failure();
+      setParDimsAttr(parallelOp,
+                     GPUParallelDimsAttr::seq(loopOp->getContext()));
+      rewriter.replaceOp(loopOp, parallelOp);
+    } else if (parMode == LoopParMode::loop_auto) {
+      // All loops in serial regions should have already been handled.
+      assert(!isOpInSerialRegion(loopOp) &&
+             "Expected loop to be in non-serial region");
+      // Mark as scf.for to allow auto-parallelization analysis later.
+      auto forOp =
+          convertACCLoopToSCFFor(loopOp, rewriter, /*enableCollapse=*/true);
+      if (!forOp)
+        return failure();
+      rewriter.replaceOp(loopOp, forOp);
+    } else if (!isOpInComputeRegion(loopOp)) {
+      // This loop is an orphan `acc loop` but it is not in any sort
+      // of compute region. Thus it is just a sequential non-accelerator loop.
+      auto forOp =
+          convertACCLoopToSCFFor(loopOp, rewriter, /*enableCollapse=*/false);
+      if (!forOp)
+        return failure();
+      rewriter.replaceOp(loopOp, forOp);
+    } else {
+      assert(parMode == LoopParMode::loop_independent &&
+             "Expected loop to be independent");
+      auto parallelOp = convertACCLoopToSCFParallel(loopOp, rewriter);
+      if (!parallelOp)
+        return failure();
+
+      SmallVector<GPUParallelDimAttr> parDims =
+          getParallelDimensions(loopOp, policy, deviceType);
+      if (!parDims.empty()) {
+        auto parDimsAttr =
+            GPUParallelDimsAttr::get(loopOp->getContext(), parDims);
+        setParDimsAttr(parallelOp, parDimsAttr);
+      }
+
+      rewriter.replaceOp(loopOp, parallelOp);
+    }
+    return success();
+  }
+
+private:
+  const ACCToGPUMappingPolicy &policy;
+  DeviceType deviceType;
+};
+
+//===----------------------------------------------------------------------===//
+// Compute construct conversion pattern
+//===----------------------------------------------------------------------===//
+
+template <typename ComputeConstructT>
+class ComputeOpConversion : public OpRewritePattern<ComputeConstructT> {
+public:
+  ComputeOpConversion(MLIRContext *ctx, const ACCToGPUMappingPolicy &policy,
+                      DeviceType deviceType)
+      : OpRewritePattern<ComputeConstructT>(ctx), policy(policy),
+        deviceType(deviceType) {}
+
+  LogicalResult matchAndRewrite(ComputeConstructT computeOp,
+                                PatternRewriter &rewriter) const override {
+    rewriter.setInsertionPoint(computeOp);
+    auto kernelEnv =
+        KernelEnvironmentOp::createAndPopulate(computeOp, rewriter);
+    auto launchArgs =
+        assignKnownLaunchArgs(computeOp, deviceType, rewriter, policy);
+    Region &region = computeOp.getRegion();
+    SetVector<Value> liveInValues;
+    getUsedValuesDefinedAbove(region, region, liveInValues);
+    IRMapping mapping;
+    auto computeRegion = buildComputeRegion(
+        computeOp->getLoc(), launchArgs, liveInValues.getArrayRef(),
+        ComputeConstructT::getOperationName(), region, rewriter, mapping);
+    if (!computeRegion) {
+      rewriter.eraseOp(kernelEnv);
+      return failure();
+    }
+    rewriter.eraseOp(computeOp);
+    return success();
+  }
+
+private:
+  const ACCToGPUMappingPolicy &policy;
+  DeviceType deviceType;
+};
+
+//===----------------------------------------------------------------------===//
+// Pass implementation
+//===----------------------------------------------------------------------===//
+
+class ACCComputeLowering
+    : public acc::impl::ACCComputeLoweringBase<ACCComputeLowering> {
+public:
+  using ACCComputeLoweringBase::ACCComputeLoweringBase;
+
+  void runOnOperation() override {
+    auto op = getOperation();
+    auto *context = op.getContext();
+
+    DefaultACCToGPUMappingPolicy policy;
+
+    // Part 1: Convert acc.loop to scf.parallel/scf.for while the parent
+    // compute construct is still present (needed to determine conversion
+    // strategy).
+    RewritePatternSet loopPatterns(context);
+    loopPatterns.insert<ACCLoopConversion>(context, policy, deviceType);
+    if (failed(applyPatternsGreedily(op, std::move(loopPatterns))))
+      return signalPassFailure();
+
+    // Part 2: Convert acc.parallel, acc.kernels, and acc.serial to
+    // acc.kernel_environment { acc.compute_region { ... } }.
+    RewritePatternSet computePatterns(context);
+    computePatterns
+        .insert<ComputeOpConversion<ParallelOp>, ComputeOpConversion<KernelsOp>,
+                ComputeOpConversion<SerialOp>>(context, policy, deviceType);
+    if (failed(applyPatternsGreedily(op, std::move(computePatterns))))
+      return signalPassFailure();
+  }
+};
+
+} // namespace
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
index 3d85fd805ace1..1bb16b4b9642d 100644
--- a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_mlir_dialect_library(MLIROpenACCTransforms
+  ACCComputeLowering.cpp
   ACCDeclareGPUModuleInsertion.cpp
   ACCIfClauseLowering.cpp
   ACCImplicitData.cpp
@@ -27,6 +28,8 @@ add_mlir_dialect_library(MLIROpenACCTransforms
 
   LINK_LIBS PUBLIC
   MLIRAnalysis
+  MLIRArithDialect
+  MLIRArithUtils
   MLIROpenACCAnalysis
   MLIROpenACCDialect
   MLIROpenACCUtils
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
index 911f256a3d2a6..1c63760a6984b 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtils.cpp
@@ -21,7 +21,8 @@
 #include "llvm/Support/Casting.h"
 
 mlir::Operation *mlir::acc::getEnclosingComputeOp(mlir::Region &region) {
-  return region.getParentOfType<ACC_COMPUTE_CONSTRUCT_OPS>();
+  return region
+      .getParentOfType<ACC_COMPUTE_CONSTRUCT_OPS, mlir::acc::ComputeRegionOp>();
 }
 
 template <typename OpTy>
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsCG.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsCG.cpp
index 5c5c453f2cae0..f5e0e5c33fee4 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsCG.cpp
@@ -11,7 +11,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/OpenACC/OpenACCUtilsCG.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/OpenACC/OpenACCUtilsLoop.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/IRMapping.h"
 
 namespace mlir {
 namespace acc {
@@ -51,5 +54,55 @@ std::optional<DataLayout> getDataLayout(Operation *op, bool allowDefault) {
   return std::nullopt;
 }
 
+ComputeRegionOp
+buildComputeRegion(Location loc, ValueRange launchArgs, ValueRange inputArgs,
+                   llvm::StringRef origin, Region &regionToClone,
+                   RewriterBase &rewriter, IRMapping &mapping,
+                   ValueRange output, FlatSymbolRefAttr kernelFuncName,
+                   FlatSymbolRefAttr kernelModuleName, Value stream) {
+  SmallVector<Type> resultTypes;
+  for (auto val : output)
+    resultTypes.push_back(val.getType());
+  auto computeRegion =
+      ComputeRegionOp::create(rewriter, loc, resultTypes, launchArgs, inputArgs,
+                              stream, origin, kernelFuncName, kernelModuleName);
+
+  assert(!regionToClone.getBlocks().empty() &&
+         "empty region for acc.compute_region");
+  OpBuilder::InsertionGuard guard(rewriter);
+
+  auto parWidthType = ParWidthType::get(rewriter.getContext());
+  Block *entryBlock = rewriter.createBlock(&computeRegion.getRegion());
+  for (size_t i = 0; i < launchArgs.size(); ++i)
+    entryBlock->addArgument(parWidthType, loc);
+  for (Value input : inputArgs)
+    entryBlock->addArgument(input.getType(), loc);
+  for (size_t i = 0; i < inputArgs.size(); ++i)
+    mapping.map(inputArgs[i], entryBlock->getArgument(launchArgs.size() + i));
+  rewriter.setInsertionPointToStart(entryBlock);
+  if (regionToClone.getBlocks().size() == 1) {
+    for (auto &op : regionToClone.front().getOperations()) {
+      if (op.hasTrait<OpTrait::IsTerminator>())
+        break;
+      rewriter.clone(op, mapping);
+    }
+  } else {
+    auto exeRegion = mlir::acc::wrapMultiBlockRegionWithSCFExecuteRegion(
+        regionToClone, mapping, loc, rewriter);
+    if (!exeRegion) {
+      rewriter.eraseOp(computeRegion);
+      return nullptr;
+    }
+  }
+
+  SmallVector<Value> yieldOperands;
+  for (auto val : output)
+    yieldOperands.push_back(mapping.lookup(val));
+  rewriter.setInsertionPointToEnd(entryBlock);
+  YieldOp::create(rewriter, loc, yieldOperands);
+
+  return computeRegion;
+}
+
 } // namespace acc
 } // namespace mlir
diff --git a/mlir/test/Dialect/OpenACC/acc-compute-lowering-compute.mlir b/mlir/test/Dialect/OpenACC/acc-compute-lowering-compute.mlir
new file mode 100644
index 0000000000000..77c4ba94c4f18
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/acc-compute-lowering-compute.mlir
@@ -0,0 +1,107 @@
+// RUN: mlir-opt %s -acc-compute-lowering | FileCheck %s
+
+// CHECK-LABEL: func.func @parallel_gang_loop
+func.func @parallel_gang_loop(%buf: memref<1xi32>) {
+  %c0 = arith.constant 0 : index
+  %c1_i32 = arith.constant 1 : i32
+  %c10_i32 = arith.constant 10 : i32
+  %c100_i32 = arith.constant 100 : i32
+
+  %dev = acc.copyin varPtr(%buf : memref<1xi32>) -> memref<1xi32>
+  // CHECK-NOT: acc.parallel
+  // CHECK: acc.kernel_environment
+  // CHECK: acc.par_width {{.*}} {par_dim = #acc.par_dim<block_x>}
+  // CHECK: acc.compute_region launch(
+  // CHECK: scf.parallel
+  // CHECK: acc.par_dims = #acc<par_dims[block_x]>
+  acc.parallel num_gangs({%c10_i32 : i32}) dataOperands(%dev : memref<1xi32>) {
+    acc.loop gang control(%arg0 : i32) = (%c1_i32 : i32) to (%c100_i32 : i32) step (%c1_i32 : i32) {
+      memref.store %arg0, %dev[%c0] : memref<1xi32>
+      acc.yield
+    } attributes {independent = [#acc.device_type<none>]}
+    acc.yield
+  }
+  acc.copyout accPtr(%dev : memref<1xi32>) to varPtr(%buf : memref<1xi32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @parallel_seq_loop
+func.func @parallel_seq_loop(%buf: memref<4xi32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %c10_i32 = arith.constant 10 : i32
+
+  %dev = acc.copyin varPtr(%buf : memref<4xi32>) -> memref<4xi32>
+  // CHECK-NOT: acc.parallel
+  // CHECK: acc.kernel_environment
+  // CHECK: acc.par_width {{.*}} {par_dim = #acc.par_dim<block_x>}
+  // CHECK: acc.compute_region launch(
+  // CHECK: scf.parallel
+  // CHECK: acc.par_dims = #acc<par_dims[sequential]>
+  acc.parallel num_gangs({%c10_i32 : i32}) dataOperands(%dev : memref<4xi32>) {
+    acc.loop control(%i : index) = (%c0 : index) to (%c4 : index) step (%c1 : index) {
+      %vi = arith.index_cast %i : index to i32
+      memref.store %vi, %dev[%i] : memref<4xi32>
+      acc.yield
+    } attributes {seq = [#acc.device_type<none>]}
+    acc.yield
+  }
+  acc.copyout accPtr(%dev : memref<4xi32>) to varPtr(%buf : memref<4xi32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @serial_loop
+func.func @serial_loop(%buf: memref<4xi32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+
+  %dev = acc.copyin varPtr(%buf : memref<4xi32>) -> memref<4xi32>
+  // CHECK-NOT: acc.serial
+  // CHECK: acc.kernel_environment
+  // CHECK-NOT: acc.par_width
+  // CHECK: acc.compute_region
+  // CHECK: scf.parallel
+  // CHECK: acc.par_dims = #acc<par_dims[sequential]>
+  acc.serial dataOperands(%dev : memref<4xi32>) {
+    acc.loop control(%i : index) = (%c0 : index) to (%c4 : index) step (%c1 : index) {
+      %vi = arith.index_cast %i : index to i32
+      memref.store %vi, %dev[%i] : memref<4xi32>
+      acc.yield
+    } attributes {independent = [#acc.device_type<none>]}
+    acc.yield
+  }
+  acc.copyout accPtr(%dev : memref<4xi32>) to varPtr(%buf : memref<4xi32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @kernels_loop
+func.func @kernels_loop(%buf: memref<8xi32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+
+  %dev = acc.copyin varPtr(%buf : memref<8xi32>) -> memref<8xi32>
+  // CHECK-NOT: acc.kernels
+  // CHECK: acc.kernel_environment
+  // CHECK-NOT: acc.par_width
+  // CHECK: acc.compute_region
+  // CHECK: scf.parallel
+  acc.kernels dataOperands(%dev : memref<8xi32>) {
+    acc.loop control(%i : index) = (%c0 : index) to (%c8 : index) step (%c1 : index) {
+      %vi = arith.index_cast %i : index to i32
+      memref.store %vi, %dev[%i] : memref<8xi32>
+      acc.yield
+    } attributes {independent = [#acc.device_type<none>]}
+    acc.terminator
+  }
+  acc.copyout accPtr(%dev : memref<8xi32>) to varPtr(%buf : memref<8xi32>)
+  return
+}
diff --git a/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir b/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
new file mode 100644
index 0000000000000..4a5ea390233e2
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
@@ -0,0 +1,130 @@
+// RUN: mlir-opt %s -acc-compute-lowering | FileCheck %s
+
+// CHECK-LABEL: func.func @parallel_independent_loop
+func.func @parallel_independent_loop(%buf: memref<16xi32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c16 = arith.constant 16 : index
+
+  %dev = acc.copyin varPtr(%buf : memref<16xi32>) -> memref<16xi32>
+  // CHECK-NOT: acc.parallel
+  // CHECK: acc.kernel_environment
+  // CHECK-NOT: acc.par_width
+  // CHECK: acc.compute_region
+  // CHECK: scf.parallel
+  acc.parallel dataOperands(%dev : memref<16xi32>) {
+    acc.loop control(%i : index) = (%c0 : index) to (%c16 : index) step (%c1 : index) {
+      %vi = arith.index_cast %i : index to i32
+      memref.store %vi, %dev[%i] : memref<16xi32>
+      acc.yield
+    } attributes {independent = [#acc.device_type<none>]}
+    acc.yield
+  }
+  acc.copyout accPtr(%dev : memref<16xi32>) to varPtr(%buf : memref<16xi32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @parallel_loop_multi_block_body
+func.func @parallel_loop_multi_block_body(%buf: memref<4xi32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+
+  %dev = acc.copyin varPtr(%buf : memref<4xi32>) -> memref<4xi32>
+  // CHECK-NOT: acc.parallel
+  // CHECK: acc.kernel_environment
+  // CHECK-NOT: acc.par_width
+  // CHECK: acc.compute_region
+  // CHECK: scf.parallel
+  // CHECK: scf.execute_region
+  acc.parallel dataOperands(%dev : memref<4xi32>) {
+    acc.loop control(%i : index) = (%c0 : index) to (%c4 : index) step (%c1 : index) {
+      %vi = arith.index_cast %i : index to i32
+      memref.store %vi, %dev[%i] : memref<4xi32>
+      cf.br ^bb1
+    ^bb1:
+      acc.yield
+    } attributes {independent = [#acc.device_type<none>]}
+    acc.yield
+  }
+  acc.copyout accPtr(%dev : memref<4xi32>) to varPtr(%buf : memref<4xi32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @parallel_loop_auto_collapse
+func.func @parallel_loop_auto_collapse(%buf: memref<1xi32>, %lb0 : index, %ub0 : index, %lb1 : index, %ub1 : index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  %dev = acc.copyin varPtr(%buf : memref<1xi32>) -> memref<1xi32>
+  // CHECK-NOT: acc.parallel
+  // CHECK: acc.kernel_environment
+  // CHECK-NOT: acc.par_width
+  // CHECK: acc.compute_region
+  // CHECK: scf.for
+  // CHECK-NOT: scf.for
+  // CHECK-NOT: scf.parallel
+  acc.parallel dataOperands(%dev : memref<1xi32>) {
+    acc.loop control(%i : index, %j : index) = (%lb0, %lb1 : index, index) to (%ub0, %ub1 : index, index) step (%c1, %c1 : index, index) {
+      %vi = arith.index_cast %i : index to i32
+      memref.store %vi, %dev[%c0] : memref<1xi32>
+      acc.yield
+    } attributes {auto_ = [#acc.device_type<none>]}
+    acc.yield
+  }
+  acc.copyout accPtr(%dev : memref<1xi32>) to varPtr(%buf : memref<1xi32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @serial_loop_normalized
+func.func @serial_loop_normalized(%buf: memref<1xi32>) {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c5 = arith.constant 5 : index
+  %c9 = arith.constant 9 : index
+
+  %dev = acc.copyin varPtr(%buf : memref<1xi32>) -> memref<1xi32>
+  // CHECK-NOT: acc.serial
+  // CHECK: acc.kernel_environment
+  // CHECK-NOT: acc.par_width
+  // CHECK: acc.compute_region
+  // CHECK: scf.parallel
+  // CHECK-DAG: arith.muli
+  // CHECK-DAG: arith.addi
+  // CHECK: acc.par_dims = #acc<par_dims[sequential]>
+  acc.serial dataOperands(%dev : memref<1xi32>) {
+    acc.loop control(%i : index) = (%c5 : index) to (%c9 : index) step (%c2 : index) {
+      %vi = arith.index_cast %i : index to i32
+      memref.store %vi, %dev[%c0] : memref<1xi32>
+      acc.yield
+    } attributes {independent = [#acc.device_type<none>]}
+    acc.yield
+  }
+  acc.copyout accPtr(%dev : memref<1xi32>) to varPtr(%buf : memref<1xi32>)
+  return
+}
+
+// -----
+
+// CHECK-LABEL: func.func @orphan_loop
+func.func @orphan_loop(%buf: memref<8xi32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c0_i32 = arith.constant 0 : i32
+
+  // CHECK-NOT: acc.loop
+  // CHECK: scf.for
+  // CHECK-NOT: scf.parallel
+  acc.loop control(%i : index) = (%c0 : index) to (%c8 : index) step (%c1 : index) {
+    memref.store %c0_i32, %buf[%i] : memref<8xi32>
+    acc.yield
+  } attributes {independent = [#acc.device_type<none>]}
+  return
+}
diff --git a/mlir/test/Dialect/OpenACC/acc-compute-lowering-unstructured.mlir b/mlir/test/Dialect/OpenACC/acc-compute-lowering-unstructured.mlir
new file mode 100644
index 0000000000000..f22d7872ecc32
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/acc-compute-lowering-unstructured.mlir
@@ -0,0 +1,34 @@
+// RUN: mlir-opt %s -acc-compute-lowering | FileCheck %s
+
+// CHECK-LABEL: func.func @parallel_unstructured_loop
+func.func @parallel_unstructured_loop(%buf: memref<10xi32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c10 = arith.constant 10 : index
+  %c1_i32 = arith.constant 1 : i32
+
+  %dev = acc.copyin varPtr(%buf : memref<10xi32>) -> memref<10xi32>
+  // CHECK-NOT: acc.loop
+  // CHECK: acc.kernel_environment
+  // CHECK-NOT: acc.par_width
+  // CHECK: acc.compute_region
+  // CHECK: scf.execute_region
+  acc.parallel dataOperands(%dev : memref<10xi32>) {
+    acc.loop {
+    ^entry:
+      cf.br ^header(%c0 : index)
+    ^header(%iv: index):
+      %cond = arith.cmpi ult, %iv, %c10 : index
+      cf.cond_br %cond, ^body, ^exit
+    ^body:
+      memref.store %c1_i32, %dev[%iv] : memref<10xi32>
+      %iv_next = arith.addi %iv, %c1 : index
+      cf.br ^header(%iv_next : index)
+    ^exit:
+      acc.yield
+    } attributes {independent = [#acc.device_type<none>], unstructured}
+    acc.yield
+  }
+  acc.copyout accPtr(%dev : memref<10xi32>) to varPtr(%buf : memref<10xi32>)
+  return
+}
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
index b2d5409f495f5..d56054a0b5877 100644
--- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
@@ -7,9 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/OpenACC/OpenACCUtilsCG.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/DLTI/DLTI.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/IRMapping.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "gtest/gtest.h"
@@ -24,7 +29,9 @@ using namespace mlir::acc;
 class OpenACCUtilsCGTest : public ::testing::Test {
 protected:
   OpenACCUtilsCGTest() : b(&context), loc(UnknownLoc::get(&context)) {
-    context.loadDialect<acc::OpenACCDialect, DLTIDialect>();
+    context.loadDialect<acc::OpenACCDialect, arith::ArithDialect,
+                        func::FuncDialect, scf::SCFDialect, gpu::GPUDialect,
+                        DLTIDialect>();
   }
 
   MLIRContext context;
@@ -74,3 +81,71 @@ TEST_F(OpenACCUtilsCGTest, getDataLayoutWithSpec) {
   auto dl2 = getDataLayout(module->getOperation(), /*allowDefault=*/true);
   EXPECT_TRUE(dl2.has_value());
 }
+
+//===----------------------------------------------------------------------===//
+// buildComputeRegion Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCUtilsCGTest, buildComputeRegionEmpty) {
+  OwningOpRef<ModuleOp> module = ModuleOp::create(b, loc);
+  IRRewriter rewriter(&context);
+  rewriter.setInsertionPointToEnd(module->getBody());
+
+  auto funcTy = b.getFunctionType({}, {});
+  auto func = func::FuncOp::create(rewriter, loc, "test", funcTy);
+  Block *entry = func.addEntryBlock();
+  rewriter.setInsertionPointToStart(entry);
+
+  Region sourceRegion;
+  Block *srcBlock = new Block();
+  sourceRegion.push_back(srcBlock);
+  OpBuilder srcBuilder(&context);
+  srcBuilder.setInsertionPointToStart(srcBlock);
+  YieldOp::create(srcBuilder, loc);
+
+  IRMapping mapping;
+  auto cr =
+      buildComputeRegion(loc, /*launchArgs=*/{}, /*inputArgs=*/{},
+                        SerialOp::getOperationName(), sourceRegion, rewriter,
+                        mapping);
+
+  EXPECT_EQ(cr.getOrigin(), SerialOp::getOperationName());
+  EXPECT_EQ(cr.getLaunchArgs().size(), 0u);
+  EXPECT_EQ(cr.getInputArgs().size(), 0u);
+  EXPECT_TRUE(cr.getRegion().hasOneBlock());
+
+  func::ReturnOp::create(rewriter, loc);
+}
+
+TEST_F(OpenACCUtilsCGTest, buildComputeRegionWithLaunchArgs) {
+  OwningOpRef<ModuleOp> module = ModuleOp::create(b, loc);
+  IRRewriter rewriter(&context);
+  rewriter.setInsertionPointToEnd(module->getBody());
+
+  auto funcTy = b.getFunctionType({}, {});
+  auto func = func::FuncOp::create(rewriter, loc, "test", funcTy);
+  Block *entry = func.addEntryBlock();
+  rewriter.setInsertionPointToStart(entry);
+
+  auto c128 = arith::ConstantIndexOp::create(rewriter, loc, 128);
+  auto threadXDim = GPUParallelDimAttr::threadXDim(&context);
+  auto pw = ParWidthOp::create(rewriter, loc, c128, threadXDim);
+
+  Region sourceRegion;
+  Block *srcBlock = new Block();
+  sourceRegion.push_back(srcBlock);
+  OpBuilder srcBuilder(&context);
+  srcBuilder.setInsertionPointToStart(srcBlock);
+  YieldOp::create(srcBuilder, loc);
+
+  IRMapping mapping;
+  auto cr = buildComputeRegion(loc, {pw}, /*inputArgs=*/{},
+                              ParallelOp::getOperationName(), sourceRegion,
+                              rewriter, mapping);
+
+  EXPECT_EQ(cr.getOrigin(), ParallelOp::getOperationName());
+  EXPECT_EQ(cr.getLaunchArgs().size(), 1u);
+  EXPECT_EQ(cr.getLaunchArgs()[0], pw.getResult());
+
+  func::ReturnOp::create(rewriter, loc);
+}

>From 28a10ff67b56cffb0e10b24eb31aa0916cdefb50 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Mon, 9 Mar 2026 12:55:59 -0700
Subject: [PATCH 2/3] Fix format

---
 mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h    |  2 +-
 mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp             |  5 +++--
 mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp | 11 +++++------
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
index b5fdcca761a4a..ba453b11492f3 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsCG.h
@@ -15,8 +15,8 @@
 #define MLIR_DIALECT_OPENACC_OPENACCUTILSCG_H_
 
 #include "mlir/Dialect/OpenACC/OpenACC.h"
-#include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include <optional>
 
 namespace mlir {
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
index ba677082ba4e2..d57c65694da61 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACCCG.cpp
@@ -219,8 +219,9 @@ void KernelEnvironmentOp::getCanonicalizationPatterns(
 }
 
 template <typename ComputeConstructT>
-KernelEnvironmentOp KernelEnvironmentOp::createAndPopulate(
-    ComputeConstructT computeConstruct, OpBuilder &builder) {
+KernelEnvironmentOp
+KernelEnvironmentOp::createAndPopulate(ComputeConstructT computeConstruct,
+                                       OpBuilder &builder) {
   auto kernelEnvironment = KernelEnvironmentOp::create(
       builder, computeConstruct->getLoc(),
       computeConstruct.getDataClauseOperands(),
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
index d56054a0b5877..671fa6c5560eb 100644
--- a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsCGTest.cpp
@@ -104,10 +104,9 @@ TEST_F(OpenACCUtilsCGTest, buildComputeRegionEmpty) {
   YieldOp::create(srcBuilder, loc);
 
   IRMapping mapping;
-  auto cr =
-      buildComputeRegion(loc, /*launchArgs=*/{}, /*inputArgs=*/{},
-                        SerialOp::getOperationName(), sourceRegion, rewriter,
-                        mapping);
+  auto cr = buildComputeRegion(loc, /*launchArgs=*/{}, /*inputArgs=*/{},
+                               SerialOp::getOperationName(), sourceRegion,
+                               rewriter, mapping);
 
   EXPECT_EQ(cr.getOrigin(), SerialOp::getOperationName());
   EXPECT_EQ(cr.getLaunchArgs().size(), 0u);
@@ -140,8 +139,8 @@ TEST_F(OpenACCUtilsCGTest, buildComputeRegionWithLaunchArgs) {
 
   IRMapping mapping;
   auto cr = buildComputeRegion(loc, {pw}, /*inputArgs=*/{},
-                              ParallelOp::getOperationName(), sourceRegion,
-                              rewriter, mapping);
+                               ParallelOp::getOperationName(), sourceRegion,
+                               rewriter, mapping);
 
   EXPECT_EQ(cr.getOrigin(), ParallelOp::getOperationName());
   EXPECT_EQ(cr.getLaunchArgs().size(), 1u);

>From b3b0985ea6e86c9041e151380d0730ecbb666829 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Tue, 10 Mar 2026 10:46:42 -0700
Subject: [PATCH 3/3] Convert to scf.parallel in specialized routine

---
 mlir/include/mlir/Dialect/OpenACC/OpenACC.h   |  4 +--
 .../OpenACC/Transforms/ACCComputeLowering.cpp |  5 +++-
 .../OpenACC/acc-compute-lowering-loop.mlir    | 25 +++++++++++++++++++
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
index fe5b42807236f..55fc8251a9bbd 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h
@@ -189,13 +189,13 @@ static constexpr StringLiteral getSpecializedRoutineAttrName() {
 /// Used to check whether the current operation is marked with
 /// `acc routine`. The operation passed in should be a function.
 inline bool isAccRoutine(mlir::Operation *op) {
-  return op->hasAttr(mlir::acc::getRoutineInfoAttrName());
+  return op && op->hasAttr(mlir::acc::getRoutineInfoAttrName());
 }
 
 /// Used to check whether this is a specialized accelerator version of
 /// `acc routine` function.
 inline bool isSpecializedAccRoutine(mlir::Operation *op) {
-  return op->hasAttr(mlir::acc::getSpecializedRoutineAttrName());
+  return op && op->hasAttr(mlir::acc::getSpecializedRoutineAttrName());
 }
 
 static constexpr StringLiteral getFromDefaultClauseAttrName() {
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp b/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
index 8bb5dc9eb43d7..60787dd9c0be5 100644
--- a/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
+++ b/mlir/lib/Dialect/OpenACC/Transforms/ACCComputeLowering.cpp
@@ -50,6 +50,7 @@
 #include "mlir/Dialect/OpenACC/OpenACCUtilsLoop.h"
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/IR/IRMapping.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/RegionUtils.h"
 
@@ -252,7 +253,9 @@ class ACCLoopConversion : public OpRewritePattern<LoopOp> {
       if (!forOp)
         return failure();
       rewriter.replaceOp(loopOp, forOp);
-    } else if (!isOpInComputeRegion(loopOp)) {
+    } else if (!isOpInComputeRegion(loopOp) &&
+               !isSpecializedAccRoutine(
+                   loopOp->getParentOfType<FunctionOpInterface>())) {
       // This loop is an orphan `acc loop` but it is not in any sort
       // of compute region. Thus it is just a sequential non-accelerator loop.
       auto forOp =
diff --git a/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir b/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
index 4a5ea390233e2..69f9e748a92d7 100644
--- a/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
+++ b/mlir/test/Dialect/OpenACC/acc-compute-lowering-loop.mlir
@@ -128,3 +128,28 @@ func.func @orphan_loop(%buf: memref<8xi32>) {
   } attributes {independent = [#acc.device_type<none>]}
   return
 }
+
+// -----
+
+// A loop inside a specialized acc routine must not be treated as an orphan
+// (which lowers to scf.for); when independent it becomes scf.parallel. With
+// the vector attribute, scf.parallel gets acc.par_dims = thread_x (vector dim).
+acc.routine @routine_with_loop func(@device_routine_with_loop) seq
+// CHECK-LABEL: func.func @device_routine_with_loop
+// CHECK: attributes {acc.specialized_routine = #acc.specialized_routine<@routine_with_loop, <seq>, "host_routine_with_loop">}
+// CHECK-NOT: acc.loop
+// CHECK: scf.parallel
+// CHECK: acc.par_dims = #acc<par_dims[thread_x]>
+// CHECK-NOT: scf.for
+func.func @device_routine_with_loop(%buf: memref<8xi32>) attributes {acc.specialized_routine = #acc.specialized_routine<@routine_with_loop, <seq>, "host_routine_with_loop">} {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c0_i32 = arith.constant 0 : i32
+
+  acc.loop control(%i : index) = (%c0 : index) to (%c8 : index) step (%c1 : index) {
+    memref.store %c0_i32, %buf[%i] : memref<8xi32>
+    acc.yield
+  } attributes {independent = [#acc.device_type<none>], vector = [#acc.device_type<none>]}
+  return
+}