[Mlir-commits] [mlir] b85cf95 - [mlir][acc] Move acc routine functions into GPU module (#187161)

Wed Mar 18 07:46:04 PDT 2026

Author: Razvan Lupusoru
Date: 2026-03-18T07:45:56-07:00
New Revision: b85cf95aad712879e8e5234a7eb59544ad7b1cca

URL: https://github.com/llvm/llvm-project/commit/b85cf95aad712879e8e5234a7eb59544ad7b1cca
DIFF: https://github.com/llvm/llvm-project/commit/b85cf95aad712879e8e5234a7eb59544ad7b1cca.diff

LOG: [mlir][acc] Move acc routine functions into GPU module (#187161)

The OpenACC routine directive defines functions that may be called from
device code; those functions (and any device-required callees) must be
present in the device compilation unit. This PR introduces the
ACCRoutineToGPUFunc pass, which moves materialized acc routines into the
GPU module as gpu.func so they can be compiled for the device.

This adds testing showing the pass on both MLIR and FIR. The FIR tests
required improvements in the OpenACCSupport implementation to ensure that
CUF and Fortran runtime functions are considered legal for GPU.

Added: 
    flang/test/Fir/OpenACC/acc-routine-to-gpu-func.fir
    mlir/lib/Dialect/OpenACC/Transforms/ACCRoutineToGPUFunc.cpp
    mlir/test/Dialect/OpenACC/acc-routine-to-gpu-func.mlir

Modified: 
    flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h
    flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp
    mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
    mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h b/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h
index f5d44c7968b1d..c2aaaee13419c 100644

--- a/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h
+++ b/flang/include/flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h
@@ -14,6 +14,7 @@
 #define FORTRAN_OPTIMIZER_OPENACC_ANALYSIS_FIROPENACCSUPPORTANALYSIS_H
 
 #include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Operation.h"
 #include "mlir/IR/Region.h"
 #include "mlir/IR/Value.h"
 #include <string>
@@ -45,6 +46,9 @@ class FIROpenACCSupportAnalysis {
   mlir::InFlightDiagnostic emitNYI(mlir::Location loc,
                                    const mlir::Twine &message);
 
+  bool isValidSymbolUse(mlir::Operation *user, mlir::SymbolRefAttr symbol,
+                        mlir::Operation **definingOpPtr);
+
   bool isValidValueUse(mlir::Value v, mlir::Region &region);
 };
 

diff  --git a/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp b/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp
index 3ad3188314fbc..cf4c38453c8ee 100644
--- a/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp
+++ b/flang/lib/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.cpp
@@ -13,6 +13,7 @@
 #include "flang/Optimizer/OpenACC/Analysis/FIROpenACCSupportAnalysis.h"
 
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/OpenACC/Support/FIROpenACCUtils.h"
 #include "mlir/Dialect/OpenACC/OpenACCUtils.h"
@@ -39,6 +40,39 @@ FIROpenACCSupportAnalysis::emitNYI(Location loc, const Twine &message) {
   return mlir::emitError(loc, "not yet implemented: " + message.str());
 }
 
+bool FIROpenACCSupportAnalysis::isValidSymbolUse(Operation *user,
+                                                 SymbolRefAttr symbol,
+                                                 Operation **definingOpPtr) {
+  // Hands the resolved definition back to the caller when one was requested.
+  auto reportDefinition = [definingOpPtr](Operation *resolved) {
+    if (definingOpPtr)
+      *definingOpPtr = resolved;
+  };
+  // The generic OpenACC check (recipes, device globals, acc routine, LLVM
+  // intrinsics, declare attribute) gets the first say.
+  Operation *target = nullptr;
+  if (mlir::acc::isValidSymbolUse(user, symbol, &target)) {
+    reportDefinition(target);
+    return true;
+  }
+
+  // Rejected and no definition was resolved: nothing FIR-specific to inspect.
+  // The caller's out-parameter is left untouched in this case.
+  if (!target)
+    return false;
+  reportDefinition(target);
+
+  // Fortran runtime functions are accepted (a GPU version is expected to be
+  // offloaded).
+  if (target->hasAttr("fir.runtime"))
+    return true;
+
+  // Procedures whose CUF attribute is anything other than Host are accepted.
+  auto procAttr =
+      target->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
+  return procAttr && procAttr.getValue() != cuf::ProcAttribute::Host;
+}
+
 bool FIROpenACCSupportAnalysis::isValidValueUse(Value v, Region &region) {
   // First check using the base utility.
   if (mlir::acc::isValidValueUse(v, region))

diff  --git a/flang/test/Fir/OpenACC/acc-routine-to-gpu-func.fir b/flang/test/Fir/OpenACC/acc-routine-to-gpu-func.fir
new file mode 100644
index 0000000000000..d31b021b18cea
--- /dev/null
+++ b/flang/test/Fir/OpenACC/acc-routine-to-gpu-func.fir
@@ -0,0 +1,103 @@
+// RUN: fir-opt %s -acc-initialize-fir-analyses -acc-routine-to-gpu-func -split-input-file | FileCheck %s
+
+// CHECK: gpu.module @acc_gpu_module
+// CHECK: gpu.func @_QPfoo
+// CHECK: fir.store
+// CHECK: gpu.return
+acc.routine @routine_seq func(@_QPfoo) seq
+func.func @_QPfoo(%arg0: !fir.ref<i32> {fir.bindc_name = "x"}) attributes {acc.specialized_routine = #acc.specialized_routine<@routine_seq, <seq>, "_QPfoo">} {
+  %c0 = arith.constant 0 : i32
+  fir.store %c0 to %arg0 : !fir.ref<i32>
+  return
+}
+
+// -----
+
+// One routine with body, one declaration-only; both end up in GPU module.
+// CHECK: acc.routine @acc_routine_0
+// CHECK: acc.routine @acc_routine_1
+// CHECK: gpu.module @{{.*}}
+// CHECK-NEXT: gpu.func @devicefunc(){{.*}} {
+// CHECK-NEXT: gpu.return
+// CHECK-NEXT: }
+// CHECK-NEXT: func.func private @declfunc()
+module {
+  acc.routine @acc_routine_0 func(@devicefunc)
+  func.func @devicefunc() attributes {acc.specialized_routine = #acc.specialized_routine<@acc_routine_0, <seq>, "devicefunc">} {
+    return
+  }
+  acc.routine @acc_routine_1 func(@declfunc)
+  func.func private @declfunc() -> ()
+}
+
+// -----
+
+// nohost routine: host copy is removed after moving to GPU module.
+// CHECK: acc.routine @acc_routine_0
+// CHECK-NOT: func.func @_QMmPfv_0
+module {
+  acc.routine @acc_routine_0 func(@_QMmPfv_0) vector nohost
+  func.func @_QMmPfv_0(%arg0: !fir.ref<i32> {fir.bindc_name = "n"}) attributes {acc.specialized_routine = #acc.specialized_routine<@acc_routine_0, <vector>, "_QMmPfv_0">} {
+    %0 = fir.dummy_scope : !fir.dscope
+    %1 = fir.declare %arg0 dummy_scope %0 {uniq_name = "_QMmFfvEn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
+    return
+  }
+}
+
+// -----
+
+// Routine that calls another acc routine: both end up in GPU module; call kept.
+// CHECK: acc.routine @r_outer
+// CHECK: acc.routine @r_inner
+// CHECK: gpu.module @{{.*}}
+// CHECK: gpu.func @outer
+// CHECK: fir.call @inner
+// CHECK: gpu.return
+// CHECK: gpu.func @inner
+module {
+  acc.routine @r_outer func(@outer) seq
+  acc.routine @r_inner func(@inner) seq
+  func.func @outer() attributes {acc.specialized_routine = #acc.specialized_routine<@r_outer, <seq>, "outer">} {
+    fir.call @inner() : () -> ()
+    return
+  }
+  func.func @inner() attributes {acc.specialized_routine = #acc.specialized_routine<@r_inner, <seq>, "inner">} {
+    return
+  }
+}
+
+// -----
+
+// Routine calls a Fortran runtime function (fir.runtime): valid for device, decl in GPU module.
+// CHECK: acc.routine @r_seq
+// CHECK: gpu.module @{{.*}}
+// CHECK: gpu.func @_QPcalls_runtime
+// CHECK: fir.call @_FortranASomeRuntime
+// CHECK: gpu.return
+// CHECK: func.func private @_FortranASomeRuntime
+module {
+  acc.routine @r_seq func(@_QPcalls_runtime) seq
+  func.func @_QPcalls_runtime() attributes {acc.specialized_routine = #acc.specialized_routine<@r_seq, <seq>, "_QPcalls_runtime">} {
+    fir.call @_FortranASomeRuntime() : () -> ()
+    return
+  }
+  func.func private @_FortranASomeRuntime() -> () attributes {fir.runtime}
+}
+
+// -----
+
+// Routine calls a CUF device procedure (cuf.proc_attr = device): valid for device, decl in GPU module.
+// CHECK: acc.routine @r_seq
+// CHECK: gpu.module @{{.*}}
+// CHECK: gpu.func @_QPcalls_cuf
+// CHECK: fir.call @_QPcuf_device
+// CHECK: gpu.return
+// CHECK: func.func private @_QPcuf_device
+module {
+  acc.routine @r_seq func(@_QPcalls_cuf) seq
+  func.func @_QPcalls_cuf() attributes {acc.specialized_routine = #acc.specialized_routine<@r_seq, <seq>, "_QPcalls_cuf">} {
+    fir.call @_QPcuf_device() : () -> ()
+    return
+  }
+  func.func private @_QPcuf_device() -> () attributes {cuf.proc_attr = #cuf.cuda_proc<device>}
+}

diff  --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
index 3dfbca478c16b..bb98759457166 100644
--- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
@@ -475,4 +475,15 @@ def ACCRoutineLowering : Pass<"acc-routine-lowering", "mlir::ModuleOp"> {
   let options = [ AccDeviceTypeOption ];
 }
 
+def ACCRoutineToGPUFunc : Pass<"acc-routine-to-gpu-func", "mlir::ModuleOp"> {
+  let summary = "Move ACC routine functions into the GPU module as gpu.func";
+  let description = [{
+    This pass moves functions associated with `acc routine` (and any callees
+    that must be present on the device) into the GPU module as `gpu.func`
+    operations.
+  }];
+  let dependentDialects = ["mlir::gpu::GPUDialect"];
+  let options = [ AccDeviceTypeOption ];
+}
+
 #endif // MLIR_DIALECT_OPENACC_TRANSFORMS_PASSES

diff  --git a/mlir/lib/Dialect/OpenACC/Transforms/ACCRoutineToGPUFunc.cpp b/mlir/lib/Dialect/OpenACC/Transforms/ACCRoutineToGPUFunc.cpp
new file mode 100644
index 0000000000000..27a34695decb2
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/Transforms/ACCRoutineToGPUFunc.cpp
@@ -0,0 +1,328 @@
+//===- ACCRoutineToGPUFunc.cpp - Move ACC routines to GPU module ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The OpenACC `routine` directive defines functions that may be invoked from
+// device code. Those functions need to be available in the device compilation
+// unit. This pass moves materialized acc routines into the GPU module as
+// gpu.func operations so they can be compiled for the device.
+//
+// Overview:
+// ---------
+// For each acc.routine that is not bound by name, the corresponding
+// specialized function (created by ACCRoutineLowering) or the original
+// host function (in case of seq) is cloned into the GPU module as a gpu.func.
+// Callees referenced from those routines are processed: device-valid callees
+// (runtime, intrinsics, other acc routines) are added to the GPU module as
+// declarations or full clones as needed. Bind-name routines are not moved;
+// their acc.routine ops are erased. After cloning, the host copies of
+// specialized device functions and nohost routines are removed.
+//
+// Approach:
+// ----------------
+// 1. Collect materialized routines (acc.routine without bind(name)); record
+//    bind-name routines for erasure. Emit remarks for materialized routines.
+//
+// 2. Process calls: walk each materialized function; for each call, if the
+//    callee is already in the GPU module or is an acc routine (or specialized
+//    acc routine), skip; otherwise require OpenACCSupport::isValidSymbolUse.
+//    Valid callees are added to the clone set (as declaration or full clone).
+//
+// 3. Clone into GPU module: each function in the clone set is turned into a
+//    gpu.func (body cloned or declaration only). acc.specialized_routine is
+//    preserved and symbol uses are updated so the routine name is unchanged.
+//
+// 4. Cleanup: erase from the host module the specialized device function
+//    bodies and any nohost routine (host copy removed after move to device).
+//
+// Example:
+// --------
+// Before (after ACCRoutineLowering):
+//   acc.routine @r_seq func(@foo) seq
+//   func.func @foo() attributes {acc.specialized_routine = ...} { ... }
+//
+// After:
+//   acc.routine @r_seq func(@foo) seq
+//   gpu.module @acc_gpu_module {
+//     gpu.func @foo() attributes {acc.specialized_routine = ...} { ... }
+//   }
+//   (host @foo erased)
+//
+// Requirements:
+// -------------
+// - Must run after the `ACCRoutineLowering` pass, which ensures variants for
+//   all levels of parallelism are created.
+// - Uses OpenACCSupport: getOrCreateGPUModule, isValidSymbolUse, emitRemark,
+//   emitNYI. If no custom implementation is registered, the default is used.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/Interfaces/CallInterfaces.h"
+#include "llvm/ADT/SetVector.h"
+#include <string>
+
+namespace mlir {
+namespace acc {
+#define GEN_PASS_DEF_ACCROUTINETOGPUFUNC
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
+} // namespace acc
+} // namespace mlir
+
+#define DEBUG_TYPE "acc-routine-to-gpu-func"
+
+using namespace mlir;
+using namespace mlir::acc;
+
+namespace {
+
+/// Build a gpu.func with the name, type and (cloned) body of `sourceFunc`.
+/// `func.return` terminators in the clone are rewritten to `gpu.return`.
+static gpu::GPUFuncOp createGPUFuncFromFunc(OpBuilder &builder,
+                                            func::FuncOp sourceFunc) {
+  // No attributes are copied from the source; specialized_routine is set
+  // later when applicable.
+  gpu::GPUFuncOp gpuFunc = gpu::GPUFuncOp::create(
+      builder, sourceFunc.getLoc(), sourceFunc.getName(),
+      sourceFunc.getFunctionType(),
+      /*workgroupAttributions=*/TypeRange(),
+      /*privateAttributions=*/TypeRange(), /*attrs=*/{});
+
+  Region &hostRegion = sourceFunc.getBody();
+  Region &gpuRegion = gpuFunc.getBody();
+  Block &hostEntry = hostRegion.front();
+  Block &gpuEntry = gpuRegion.front();
+
+  // GPUFuncOp::create already gave the entry block its arguments; pair each
+  // source argument with its counterpart so cloned uses resolve to it.
+  IRMapping valueMap;
+  for (auto [hostArg, gpuArg] :
+       llvm::zip(hostEntry.getArguments(), gpuEntry.getArguments()))
+    valueMap.map(hostArg, gpuArg);
+
+  hostRegion.cloneInto(&gpuRegion, valueMap);
+
+  // Swap every func.return in the cloned blocks for a gpu.return that takes
+  // over the same operands.
+  gpuFunc.walk([](func::ReturnOp funcReturn) {
+    OpBuilder atReturn(funcReturn);
+    gpu::ReturnOp replacement =
+        gpu::ReturnOp::create(atReturn, funcReturn.getLoc());
+    replacement->setOperands(funcReturn.getOperands());
+    funcReturn.erase();
+  });
+
+  // cloneInto put the cloned content in a separate block. Move that block's
+  // operations to the end of the real entry block, then delete the emptied
+  // clone.
+  Block *clonedEntry = valueMap.lookup(&hostEntry);
+  auto &entryOps = gpuEntry.getOperations();
+  entryOps.splice(entryOps.end(), clonedEntry->getOperations());
+  clonedEntry->erase();
+
+  return gpuFunc;
+}
+
+/// Function to clone and the acc.routine it came from (null for callees).
+using CloneCandidate = std::pair<func::FuncOp, RoutineOp>;
+
+/// Walk `mod` and sort each acc.routine: bind-name routines go to
+/// `bindAccRoutines`; the rest are recorded as materialized (routine op and
+/// function name) and get a remark describing what is generated.
+static void collectRoutineCandidates(
+    ModuleOp mod, SymbolTable &symTab, acc::DeviceType deviceType,
+    OpenACCSupport &accSupport,
+    llvm::SmallSetVector<llvm::StringRef, 4> &funcsToCloneCandidates,
+    llvm::SmallSetVector<RoutineOp, 4> &materializedAccRoutines,
+    llvm::SmallSetVector<RoutineOp, 4> &bindAccRoutines) {
+  // Any gang/worker/vector/gang-dim clause, with or without `deviceType`.
+  auto hasParallelism = [deviceType](RoutineOp routine) {
+    return routine.hasGang(deviceType) || routine.hasGang() ||
+           routine.hasWorker(deviceType) || routine.hasWorker() ||
+           routine.hasVector(deviceType) || routine.hasVector() ||
+           routine.getGangDimValue(deviceType) || routine.getGangDimValue();
+  };
+
+  mod.walk([&](RoutineOp routine) {
+    if (routine.getBindNameValue() || routine.getBindNameValue(deviceType)) {
+      bindAccRoutines.insert(routine);
+      return;
+    }
+    auto funcName = routine.getFuncName().getLeafReference();
+    func::FuncOp target = symTab.lookup<func::FuncOp>(funcName);
+    accSupport.emitRemark(
+        target ? target.getOperation() : routine.getOperation(),
+        [&routine, &hasParallelism]() {
+          std::string text = routine.getImplicitAttr()
+                                 ? "Generating implicit acc routine"
+                                 : "Generating acc routine";
+          return hasParallelism(routine) ? text : text + " seq";
+        },
+        DEBUG_TYPE);
+    funcsToCloneCandidates.insert(funcName);
+    materializedAccRoutines.insert(routine);
+  });
+}
+
+/// Add each materialized routine function not yet in the GPU module, and every
+/// device-valid callee it calls, to funcsToClone. Fails on an unsupported call.
+static LogicalResult processCallsInRoutines(
+    SymbolTable &symTab, SymbolTable &gpuSymTab, OpenACCSupport &accSupport,
+    const llvm::SmallSetVector<llvm::StringRef, 4> &funcsToCloneCandidates,
+    const llvm::SmallSetVector<RoutineOp, 4> &materializedAccRoutines,
+    llvm::SmallSetVector<CloneCandidate, 4> &funcsToClone) {
+  LogicalResult callCheckResult = success();
+  auto processCalls = [&](CallOpInterface callOp) {
+    if (!callOp.getCallableForCallee())
+      return;
+    auto calleeSymbolRef =
+        dyn_cast<SymbolRefAttr>(callOp.getCallableForCallee());
+    if (!calleeSymbolRef)
+      return;
+    auto callee =
+        symTab.lookup<func::FuncOp>(calleeSymbolRef.getLeafReference());
+    if (!callee)
+      return;
+    if (gpuSymTab.lookup(callee.getName()) || isAccRoutine(callee) ||
+        isSpecializedAccRoutine(callee))
+      return;
+    if (!accSupport.isValidSymbolUse(callOp.getOperation(), calleeSymbolRef)) {
+      accSupport.emitNYI(callOp->getLoc(), "Unsupported call in acc routine");
+      callCheckResult = failure();
+      return;
+    }
+    funcsToClone.insert({callee, RoutineOp{}});
+  };
+
+  // The name set deduplicates while the routine set does not, so the two can
+  // differ in length; take each name from its routine instead of zipping.
+  llvm::SmallSetVector<llvm::StringRef, 4> visited;
+  for (RoutineOp accRoutine : materializedAccRoutines) {
+    llvm::StringRef funcName = accRoutine.getFuncName().getLeafReference();
+    func::FuncOp func = symTab.lookup<func::FuncOp>(funcName);
+    if (!func || !funcsToCloneCandidates.count(funcName) ||
+        !visited.insert(funcName))
+      continue;
+    if (!gpuSymTab.lookup(funcName))
+      funcsToClone.insert({func, accRoutine});
+    func.walk([&](CallOpInterface callOp) { processCalls(callOp); });
+    if (failed(callCheckResult))
+      return failure();
+  }
+  return success();
+}
+
+/// Put every entry of `funcsToClone` into the GPU module: declarations are
+/// cloned as-is, definitions become gpu.func. Specialized routines keep their
+/// acc.specialized_routine attribute and take the function name it records.
+static LogicalResult cloneFuncsToGPUModule(
+    ModuleOp mod, OpenACCSupport &accSupport, SymbolTable &gpuSymTab,
+    const llvm::SmallSetVector<CloneCandidate, 4> &funcsToClone) {
+  MLIRContext *ctx = mod.getContext();
+  OpBuilder builder(ctx);
+
+  for (const CloneCandidate &candidate : funcsToClone) {
+    func::FuncOp srcFunc = candidate.first;
+
+    if (srcFunc.isDeclaration()) {
+      gpuSymTab.insert(srcFunc->clone());
+      continue;
+    }
+
+    gpu::GPUFuncOp deviceFuncOp = createGPUFuncFromFunc(builder, srcFunc);
+
+    // Look the attribute up once; it drives both the rename and the copy.
+    if (auto specRoutineAttr = srcFunc->getAttrOfType<SpecializedRoutineAttr>(
+            getSpecializedRoutineAttrName())) {
+      StringAttr funcName = specRoutineAttr.getFuncName();
+      if (failed(SymbolTable::replaceAllSymbolUses(
+              StringAttr::get(ctx, deviceFuncOp.getName()), funcName, mod))) {
+        accSupport.emitNYI(deviceFuncOp.getLoc(),
+                           "cannot replace symbol for acc routine");
+        // The clone was never inserted anywhere; erase it so it is not leaked.
+        deviceFuncOp.erase();
+        return failure();
+      }
+      deviceFuncOp->setAttr(SymbolTable::getSymbolAttrName(), funcName);
+      deviceFuncOp->setAttr(getSpecializedRoutineAttrName(), specRoutineAttr);
+    }
+
+    gpuSymTab.insert(deviceFuncOp);
+  }
+  return success();
+}
+
+/// Erase from the host module each cloned function that is a specialized
+/// acc routine or whose routine is marked nohost.
+static void
+cleanupHostModule(const llvm::SmallSetVector<CloneCandidate, 4> &funcsToClone) {
+  for (auto [hostFunc, owningRoutine] : funcsToClone) {
+    const bool isNohost = owningRoutine && owningRoutine.getNohost();
+    if (!isNohost && !acc::isSpecializedAccRoutine(hostFunc))
+      continue;
+    hostFunc.erase();
+  }
+}
+
+/// Module pass driving the collect / validate / clone / cleanup sequence.
+class ACCRoutineToGPUFunc
+    : public acc::impl::ACCRoutineToGPUFuncBase<ACCRoutineToGPUFunc> {
+public:
+  using acc::impl::ACCRoutineToGPUFuncBase<
+      ACCRoutineToGPUFunc>::ACCRoutineToGPUFuncBase;
+
+  void runOnOperation() override {
+    ModuleOp hostModule = getOperation();
+    if (hostModule.getOps<RoutineOp>().empty()) {
+      LLVM_DEBUG(llvm::dbgs()
+                 << "Skipping ACCRoutineToGPUFunc - no acc.routine ops\n");
+      return;
+    }
+
+    OpenACCSupport &support = getAnalysis<OpenACCSupport>();
+    std::optional<gpu::GPUModuleOp> maybeGpuModule =
+        support.getOrCreateGPUModule(hostModule);
+    if (!maybeGpuModule) {
+      support.emitNYI(hostModule.getLoc(), "Failed to create GPU module");
+      return signalPassFailure();
+    }
+
+    SymbolTable hostSymbols(hostModule);
+    SymbolTable gpuSymbols(*maybeGpuModule);
+
+    // Split the routines into materialized and bind-name ones.
+    llvm::SmallSetVector<llvm::StringRef, 4> candidateNames;
+    llvm::SmallSetVector<RoutineOp, 4> materialized;
+    llvm::SmallSetVector<RoutineOp, 4> boundByName;
+    collectRoutineCandidates(hostModule, hostSymbols, this->deviceType, support,
+                             candidateNames, materialized, boundByName);
+
+    // Validate callees, then clone everything needed into the GPU module.
+    llvm::SmallSetVector<CloneCandidate, 4> toClone;
+    if (failed(processCallsInRoutines(hostSymbols, gpuSymbols, support,
+                                      candidateNames, materialized, toClone)))
+      return signalPassFailure();
+
+    if (failed(cloneFuncsToGPUModule(hostModule, support, gpuSymbols, toClone)))
+      return signalPassFailure();
+
+    // Drop host copies first, then the bind-name acc.routine ops.
+    cleanupHostModule(toClone);
+    for (RoutineOp boundRoutine : boundByName)
+      boundRoutine.erase();
+  }
+};
+
+} // namespace

diff  --git a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
index 2e81988b6610b..eb4eecfff129f 100644
--- a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_mlir_dialect_library(MLIROpenACCTransforms
   ACCComputeLowering.cpp
   ACCRoutineLowering.cpp
+  ACCRoutineToGPUFunc.cpp
   ACCDeclareGPUModuleInsertion.cpp
   ACCIfClauseLowering.cpp
   ACCImplicitData.cpp

diff  --git a/mlir/test/Dialect/OpenACC/acc-routine-to-gpu-func.mlir b/mlir/test/Dialect/OpenACC/acc-routine-to-gpu-func.mlir
new file mode 100644
index 0000000000000..88d5fcd7f10e7
--- /dev/null
+++ b/mlir/test/Dialect/OpenACC/acc-routine-to-gpu-func.mlir
@@ -0,0 +1,60 @@
+// RUN: mlir-opt %s -acc-routine-to-gpu-func -split-input-file | FileCheck %s
+
+// CHECK: gpu.module @acc_gpu_module {
+// CHECK: gpu.func @host_foo
+// CHECK: memref.store
+// CHECK: gpu.return
+acc.routine @routine_seq func(@host_foo) seq
+func.func @host_foo(%buf: memref<8xi32>) attributes {acc.specialized_routine = #acc.specialized_routine<@routine_seq, <seq>, "host_foo">} {
+  %c0 = arith.constant 0 : index
+  %c0_i32 = arith.constant 0 : i32
+  memref.store %c0_i32, %buf[%c0] : memref<8xi32>
+  return
+}
+
+// -----
+
+// Bind routine is erased; materialized routine is moved to GPU module.
+// CHECK: gpu.module @acc_gpu_module {
+// CHECK: gpu.func @host_foo
+// CHECK: gpu.return
+// CHECK-NOT: acc.routine @routine_bind
+acc.routine @routine_seq func(@host_foo) seq
+acc.routine @routine_bind func(@host_bind) seq bind("myname")
+func.func @host_foo() attributes {acc.specialized_routine = #acc.specialized_routine<@routine_seq, <seq>, "host_foo">} {
+  return
+}
+func.func @host_bind() {
+  return
+}
+
+// -----
+
+// One routine with body, one declaration-only; both end up in GPU module.
+// CHECK: acc.routine @acc_routine_0
+// CHECK: acc.routine @acc_routine_1
+// CHECK: gpu.module @{{.*}}
+// CHECK-NEXT: gpu.func @devicefunc(){{.*}} {
+// CHECK-NEXT: gpu.return
+// CHECK-NEXT: }
+// CHECK-NEXT: func.func private @declfunc()
+module {
+  acc.routine @acc_routine_0 func(@devicefunc)
+  func.func @devicefunc() attributes {acc.specialized_routine = #acc.specialized_routine<@acc_routine_0, <seq>, "devicefunc">} {
+    return
+  }
+  acc.routine @acc_routine_1 func(@declfunc)
+  func.func private @declfunc() -> ()
+}
+
+// -----
+
+// nohost routine: host copy is removed after moving to GPU module.
+// CHECK: acc.routine @acc_routine_0
+// CHECK-NOT: func.func @nohost_vec
+module {
+  acc.routine @acc_routine_0 func(@nohost_vec) vector nohost
+  func.func @nohost_vec(%arg0: i32) attributes {acc.specialized_routine = #acc.specialized_routine<@acc_routine_0, <vector>, "nohost_vec">} {
+    return
+  }
+}