[flang-commits] [flang] [flang][cuda] Avoid inserting GetDeviceAddress call in offload regions (PR #175225)
Razvan Lupusoru via flang-commits
flang-commits at lists.llvm.org
Fri Jan 9 11:13:46 PST 2026
https://github.com/razvanlupusoru created https://github.com/llvm/llvm-project/pull/175225
In regions destined for GPU offload, computing an address_of yields the device address directly, so inserting a runtime call to get the address is unnecessary (and actually incorrect). This was already working for regions such as `gpu.launch`, and it now applies to acc regions as well.
>From 5d07e94fa737a62ab6c9ef5ac61f62baced7d92c Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <rlupusoru at nvidia.com>
Date: Fri, 9 Jan 2026 11:05:08 -0800
Subject: [PATCH] [flang][cuda] Avoid inserting GetDeviceAddress call in
offload regions
In regions destined for GPU offload, computing an address_of means
getting device address directly - no need (and actually incorrect)
to insert a runtime call to get the address. This was already working
for regions such as `gpu.launch` - but now it applies to acc regions
as well.
---
.../Transforms/CUDA/CUFOpConversion.cpp | 8 ++++--
flang/test/Fir/CUDA/cuda-global-addr.mlir | 27 +++++++++++++++++++
2 files changed, 33 insertions(+), 2 deletions(-)
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
index 424a8fd9d959b..352f8abde6093 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversion.cpp
@@ -29,6 +29,7 @@
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
@@ -49,9 +50,9 @@ namespace {
static bool inDeviceContext(mlir::Operation *op) {
if (op->getParentOfType<cuf::KernelOp>())
return true;
- if (auto funcOp = op->getParentOfType<mlir::gpu::GPUFuncOp>())
+ if (op->getParentOfType<mlir::acc::OffloadRegionOpInterface>())
return true;
- if (auto funcOp = op->getParentOfType<mlir::gpu::LaunchOp>())
+ if (auto funcOp = op->getParentOfType<mlir::gpu::GPUFuncOp>())
return true;
if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>()) {
if (auto cudaProcAttr =
@@ -128,6 +129,9 @@ struct DeclareOpConversion : public mlir::OpRewritePattern<fir::DeclareOp> {
if (op.getResult().getUsers().empty())
return success();
if (auto addrOfOp = op.getMemref().getDefiningOp<fir::AddrOfOp>()) {
+ if (inDeviceContext(addrOfOp)) {
+ return failure();
+ }
if (auto global = symTab.lookup<fir::GlobalOp>(
addrOfOp.getSymbol().getRootReference().getValue())) {
if (cuf::isRegisteredDeviceGlobal(global)) {
diff --git a/flang/test/Fir/CUDA/cuda-global-addr.mlir b/flang/test/Fir/CUDA/cuda-global-addr.mlir
index 6f7816c9163cb..ae88af3d3c16c 100644
--- a/flang/test/Fir/CUDA/cuda-global-addr.mlir
+++ b/flang/test/Fir/CUDA/cuda-global-addr.mlir
@@ -94,6 +94,33 @@ func.func @_QQmain() attributes {fir.bindc_name = "test"} {
// -----
+// Check that we do not introduce call to _FortranACUFGetDeviceAddress when the
+// address_of is inside an acc.parallel region (OffloadRegionOpInterface).
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+fir.global @_QMmod1Eadev_acc {data_attr = #cuf.cuda<device>} : !fir.array<10xi32> {
+ %0 = fir.zero_bits !fir.array<10xi32>
+ fir.has_value %0 : !fir.array<10xi32>
+}
+func.func @_QQmain_acc() attributes {fir.bindc_name = "test_acc"} {
+ acc.parallel {
+ %c10 = arith.constant 10 : index
+ %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+ %3 = fir.address_of(@_QMmod1Eadev_acc) : !fir.ref<!fir.array<10xi32>>
+ %4 = fir.declare %3(%1) {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Eadev_acc"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<10xi32>>
+ acc.yield
+ }
+ return
+}
+
+// CHECK-LABEL: func.func @_QQmain_acc()
+// CHECK: acc.parallel
+// CHECK-NOT: fir.call {{.*}}GetDeviceAddress
+
+}
+
+// -----
+
// Check that we do not introduce call to _FortranACUFGetDeviceAddress when the
// value has no user.
More information about the flang-commits
mailing list