[flang-commits] [flang] fb18d57 - [flang][OpenMP] Use cuf.alloc for privatization of CUDA Fortran device arrays (#185984)
via flang-commits
flang-commits at lists.llvm.org
Fri Mar 13 09:18:20 PDT 2026
Author: Zhen Wang
Date: 2026-03-13T16:18:14Z
New Revision: fb18d570b0466ca2a401aba11d6e58b206aebc1a
URL: https://github.com/llvm/llvm-project/commit/fb18d570b0466ca2a401aba11d6e58b206aebc1a
DIFF: https://github.com/llvm/llvm-project/commit/fb18d570b0466ca2a401aba11d6e58b206aebc1a.diff
LOG: [flang][OpenMP] Use cuf.alloc for privatization of CUDA Fortran device arrays (#185984)
When CUDA Fortran device arrays are listed in an OpenMP private clause,
the compiler previously allocated private copies on the host heap using
fir.allocmem. This caused device-side operations to receive host
pointers instead of device pointers, leading to cudaErrorIllegalAddress
(700).
Fix this by detecting, during privatization, symbols whose CUDA data
attribute requires special allocation (device, managed, unified, etc.)
and using cuf.alloc / cuf.free instead of fir.allocmem / fir.freemem,
so that the private copies reside in device memory.
Added:
flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
Modified:
flang/include/flang/Lower/CUDA.h
flang/lib/Lower/CUDA.cpp
flang/lib/Lower/ConvertVariable.cpp
flang/lib/Lower/Support/PrivateReductionUtils.cpp
Removed:
################################################################################
diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index 865a7c6a6fa78..83523c21380d6 100644
--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -62,6 +62,14 @@ cuf::DataAttributeAttr
translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
const Fortran::semantics::Symbol &sym);
+/// Create a cuf.alloc operation with extents and length parameters elided
+/// when they are already encoded in the static type.
+mlir::Value genCUFAlloc(fir::FirOpBuilder &builder, mlir::Location loc,
+ mlir::Type type, llvm::StringRef uniqName,
+ llvm::StringRef bindcName,
+ cuf::DataAttributeAttr dataAttr,
+ mlir::ValueRange lenParams, mlir::ValueRange extents);
+
/// Check if the rhs has an implicit conversion. Return the elemental op if
/// there is a conversion. Return null otherwise.
std::pair<hlfir::ElementalOp, hlfir::ElementalOp>
diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index d8e2d829f9adf..8fd54bc6cfe2d 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -68,6 +68,24 @@ cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute(
return cuf::getDataAttribute(mlirContext, cudaAttr);
}
+mlir::Value Fortran::lower::genCUFAlloc(fir::FirOpBuilder &builder,
+ mlir::Location loc, mlir::Type type,
+ llvm::StringRef uniqName,
+ llvm::StringRef bindcName,
+ cuf::DataAttributeAttr dataAttr,
+ mlir::ValueRange lenParams,
+ mlir::ValueRange extents) {
+ llvm::SmallVector<mlir::Value> elidedExtents =
+ fir::factory::elideExtentsAlreadyInType(type, extents);
+ llvm::SmallVector<mlir::Value> elidedLenParams =
+ fir::factory::elideLengthsAlreadyInType(type, lenParams);
+ auto idxTy = builder.getIndexType();
+ for (mlir::Value &ext : elidedExtents)
+ ext = builder.createConvert(loc, idxTy, ext);
+ return cuf::AllocOp::create(builder, loc, type, uniqName, bindcName, dataAttr,
+ elidedLenParams, elidedExtents);
+}
+
std::pair<hlfir::ElementalOp, hlfir::ElementalOp>
Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
auto isCopyElementalOp = [](hlfir::ElementalOp elOp) {
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 0ededb364bfea..ec406c9997de2 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -760,21 +760,20 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter,
cuf::DataAttributeAttr dataAttr =
Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
ultimateSymbol);
- llvm::SmallVector<mlir::Value> indices;
- llvm::SmallVector<mlir::Value> elidedShape =
- fir::factory::elideExtentsAlreadyInType(ty, shape);
- llvm::SmallVector<mlir::Value> elidedLenParams =
- fir::factory::elideLengthsAlreadyInType(ty, lenParams);
- auto idxTy = builder.getIndexType();
- for (mlir::Value sh : elidedShape)
- indices.push_back(builder.createConvert(loc, idxTy, sh));
- if (dataAttr.getValue() == cuf::DataAttribute::Shared)
+ if (dataAttr.getValue() == cuf::DataAttribute::Shared) {
+ llvm::SmallVector<mlir::Value> elidedShape =
+ fir::factory::elideExtentsAlreadyInType(ty, shape);
+ auto idxTy = builder.getIndexType();
+ llvm::SmallVector<mlir::Value> indices;
+ for (mlir::Value sh : elidedShape)
+ indices.push_back(builder.createConvert(loc, idxTy, sh));
return cuf::SharedMemoryOp::create(builder, loc, ty, nm, symNm, lenParams,
indices);
+ }
if (!cuf::isCUDADeviceContext(builder.getRegion()))
- return cuf::AllocOp::create(builder, loc, ty, nm, symNm, dataAttr,
- lenParams, indices);
+ return Fortran::lower::genCUFAlloc(builder, loc, ty, nm, symNm, dataAttr,
+ lenParams, shape);
}
// Let the builder do all the heavy lifting.
diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index d1a965d288cad..d879a0b7e97aa 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -14,6 +14,7 @@
#include "flang/Lower/AbstractConverter.h"
#include "flang/Lower/Allocatable.h"
+#include "flang/Lower/CUDA.h"
#include "flang/Lower/ConvertVariable.h"
#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/Character.h"
@@ -21,12 +22,14 @@
#include "flang/Optimizer/Builder/HLFIRTools.h"
#include "flang/Optimizer/Builder/Runtime/Derived.h"
#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Support/FatalError.h"
#include "flang/Semantics/symbol.h"
+#include "flang/Semantics/tools.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/Location.h"
@@ -39,11 +42,11 @@ static bool hasFinalization(const Fortran::semantics::Symbol &sym) {
return false;
}
-static void createCleanupRegion(Fortran::lower::AbstractConverter &converter,
- mlir::Location loc, mlir::Type argType,
- mlir::Region &cleanupRegion,
- const Fortran::semantics::Symbol *sym,
- bool isDoConcurrent) {
+static void createCleanupRegion(
+ Fortran::lower::AbstractConverter &converter, mlir::Location loc,
+ mlir::Type argType, mlir::Region &cleanupRegion,
+ const Fortran::semantics::Symbol *sym, bool isDoConcurrent,
+ std::optional<cuf::DataAttributeAttr> cudaDataAttr = std::nullopt) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
assert(cleanupRegion.empty());
mlir::Block *block = builder.createBlock(&cleanupRegion, cleanupRegion.end(),
@@ -102,9 +105,14 @@ static void createCleanupRegion(Fortran::lower::AbstractConverter &converter,
fir::IfOp::create(builder, loc, isAllocated, /*withElseRegion=*/false);
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
- mlir::Value cast = builder.createConvert(
- loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr);
- fir::FreeMemOp::create(builder, loc, cast);
+ if (cudaDataAttr) {
+ cuf::FreeOp::create(builder, loc, addr, *cudaDataAttr);
+ } else {
+ mlir::Value cast = builder.createConvert(
+ loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())),
+ addr);
+ fir::FreeMemOp::create(builder, loc, cast);
+ }
builder.setInsertionPointAfter(ifOp);
if (isDoConcurrent)
@@ -530,6 +538,31 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
if (shouldAllocateTempOnStack())
return createStackTempFromMold(loc, builder, source);
+ // For CUDA device arrays that require special allocation (device,
+ // managed, unified, etc.), use cuf.alloc instead of fir.allocmem so
+ // that the private copy lives in device memory.
+ if (sym && Fortran::semantics::NeedCUDAAlloc(sym->GetUltimate())) {
+ cuf::DataAttributeAttr dataAttr =
+ Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
+ sym->GetUltimate());
+ mlir::Type sequenceType =
+ hlfir::getFortranElementOrSequenceType(source.getType());
+ mlir::Value shape = hlfir::genShape(loc, builder, source);
+ auto extents = hlfir::getIndexExtents(loc, builder, shape);
+ mlir::Value alloc = Fortran::lower::genCUFAlloc(
+ builder, loc, sequenceType, /*uniqName=*/"", /*bindcName=*/".tmp",
+ dataAttr, lenParams, extents);
+ auto declareOp = hlfir::DeclareOp::create(
+ builder, loc, alloc, ".tmp", shape, lenParams,
+ /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0,
+ fir::FortranVariableFlagsAttr{}, dataAttr);
+ hlfir::Entity temp{declareOp.getBase()};
+ mlir::OpBuilder::InsertionGuard guard(builder);
+ createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
+ isDoConcurrent, dataAttr);
+ return temp;
+ }
+
auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
// if needsDealloc, add cleanup region. Always
// do this for allocatable boxes because they might have been re-allocated
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf b/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
new file mode 100644
index 0000000000000..c62f1c4173145
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
@@ -0,0 +1,31 @@
+! Test that OpenMP privatization of CUDA Fortran device arrays uses cuf.alloc
+! instead of fir.allocmem so the private copy resides in device memory.
+
+! RUN: bbc -emit-hlfir -fcuda -fopenmp %s -o - | FileCheck %s
+
+subroutine omp_private_device_array()
+ implicit none
+ integer(4), device :: a(8)
+
+ !$omp parallel private(a)
+ a(1) = 42
+ !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER:.*]] : !fir.box<!fir.array<8xi32>> init {
+
+! CHECK-NEXT: ^bb0(%[[MOLD:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>, %[[PRIV:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>):
+! CHECK-NEXT: %[[C8:.*]] = arith.constant 8 : index
+! CHECK-NEXT: %[[SHAPE:.*]] = fir.shape %[[C8]]
+! CHECK-NEXT: %[[ALLOC:.*]] = cuf.alloc !fir.array<8xi32> {bindc_name = ".tmp", data_attr = #cuf.cuda<device>}
+! CHECK-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]](%[[SHAPE]]) {data_attr = #cuf.cuda<device>, uniq_name = ".tmp"}
+! CHECK: fir.embox
+! CHECK: fir.store
+! CHECK-NEXT: omp.yield
+
+! CHECK: } dealloc {
+! CHECK-NEXT: ^bb0(%[[DEALLOC_ARG:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>):
+! CHECK: cuf.free %{{.*}} {data_attr = #cuf.cuda<device>}
+! CHECK: omp.yield
+! CHECK-NEXT: }
More information about the flang-commits mailing list