[flang] [llvm] [flang][cuda] Pass allocator index to allocate functions (PR #157189)
Valentin Clement バレンタイン クレメン via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 8 09:11:58 PDT 2025
https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/157189
From 163d9dc781f331d8f06b097470fbbaf46f61c897 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 5 Sep 2025 14:44:00 -0700
Subject: [PATCH 1/4] [flang][cuda] Pass allocator index to allocate functions
---
flang-rt/lib/cuda/allocatable.cpp | 25 +++++++-------
flang-rt/lib/cuda/pointer.cpp | 33 +++++++++++--------
flang-rt/lib/runtime/CMakeLists.txt | 2 +-
flang/include/flang/Lower/CUDA.h | 16 +--------
.../Dialect/CUF/Attributes/CUFAttr.h | 2 ++
.../include/flang/Runtime/CUDA/allocatable.h | 22 +++++++------
flang/include/flang/Runtime/CUDA/pointer.h | 22 +++++++------
flang/lib/Lower/CUDA.cpp | 17 ++++++++++
flang/lib/Lower/ConvertVariable.cpp | 18 ++--------
.../Dialect/CUF/Attributes/CUFAttr.cpp | 13 ++++++++
.../Optimizer/Transforms/CUFOpConversion.cpp | 12 ++++---
flang/test/Fir/CUDA/cuda-allocate.fir | 14 ++++----
12 files changed, 108 insertions(+), 88 deletions(-)
diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index ff1a225d66ce9..483b54061036d 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -23,11 +23,11 @@ namespace Fortran::runtime::cuda {
extern "C" {
RT_EXT_API_GROUP_BEGIN
-int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
- bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int32_t allocIdx,
+ int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocate)(
- desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+ desc, allocIdx, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
#ifndef RT_DEVICE_COMPILATION
// Descriptor synchronization is only done when the allocation is done
// from the host.
@@ -41,9 +41,12 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
return stat;
}
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
- bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int32_t allocIdx,
+ int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
+#if !defined(RT_DEVICE_COMPILATION)
+ desc.SetAllocIdx(allocIdx);
+#endif
// Perform the standard allocation.
int stat{RTNAME(AllocatableAllocate)(
desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
@@ -56,10 +59,10 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
}
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
+ const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
- int stat{RTNAME(CUFAllocatableAllocate)(
- alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+ int stat{RTNAME(CUFAllocatableAllocate)(alloc, allocIdx, stream, pinned,
+ hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
@@ -69,10 +72,10 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
}
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
+ const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
- int stat{RTNAME(CUFAllocatableAllocateSync)(
- alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+ int stat{RTNAME(CUFAllocatableAllocateSync)(alloc, allocIdx, stream, pinned,
+ hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp
index d3f5cfe8e96a1..3e450596e0f12 100644
--- a/flang-rt/lib/cuda/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@@ -22,9 +22,12 @@ namespace Fortran::runtime::cuda {
extern "C" {
RT_EXT_API_GROUP_BEGIN
-int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
- bool hasStat, const Descriptor *errMsg, const char *sourceFile,
- int sourceLine) {
+int RTDEF(CUFPointerAllocate)(Descriptor &desc, int32_t allocIdx,
+ int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
+ const char *sourceFile, int sourceLine) {
+#if !defined(RT_DEVICE_COMPILATION)
+ desc.SetAllocIdx(allocIdx);
+#endif
// Perform the standard allocation.
int stat{
RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
@@ -36,11 +39,11 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
return stat;
}
-int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
- bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int32_t allocIdx,
+ int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
int stat{RTNAME(CUFPointerAllocate)(
- desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+ desc, allocIdx, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
#ifndef RT_DEVICE_COMPILATION
// Descriptor synchronization is only done when the allocation is done
// from the host.
@@ -55,10 +58,11 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
}
int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
- const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
- const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
- int stat{RTNAME(CUFPointerAllocate)(
- pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+ const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+ bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+ int sourceLine) {
+ int stat{RTNAME(CUFPointerAllocate)(pointer, allocIdx, stream, pinned,
+ hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
@@ -68,10 +72,11 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
}
int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
- const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
- const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
- int stat{RTNAME(CUFPointerAllocateSync)(
- pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+ const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+ bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+ int sourceLine) {
+ int stat{RTNAME(CUFPointerAllocateSync)(pointer, allocIdx, stream, pinned,
+ hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
Fortran::runtime::DoFromSourceAssign(
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 6548ec955b2b8..bd4eca52d6e29 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -180,7 +180,7 @@ if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
# findloc.cpp has some issues with higher compute capability. Remove it
# from CUDA build until we can lower its memory footprint.
- list(REMOVE_ITEM supported_sources findloc.cpp)
+
set(sources ${supported_sources})
else ()
set(sources ${supported_sources} ${host_sources} ${f128_sources})
diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index 4a831fd502af4..0a085f47327f2 100644
--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -31,21 +31,7 @@ namespace Fortran::lower {
class AbstractConverter;
-static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
- std::optional<Fortran::common::CUDADataAttr> cudaAttr =
- Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
- if (cudaAttr) {
- if (*cudaAttr == Fortran::common::CUDADataAttr::Pinned)
- return kPinnedAllocatorPos;
- if (*cudaAttr == Fortran::common::CUDADataAttr::Device)
- return kDeviceAllocatorPos;
- if (*cudaAttr == Fortran::common::CUDADataAttr::Managed)
- return kManagedAllocatorPos;
- if (*cudaAttr == Fortran::common::CUDADataAttr::Unified)
- return kUnifiedAllocatorPos;
- }
- return kDefaultAllocator;
-}
+unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym);
void initializeDeviceComponentAllocator(
Fortran::lower::AbstractConverter &converter,
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
index 4a250d1cc6c54..c00f9e718ad18 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
+++ b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
@@ -112,6 +112,8 @@ cuf::DataAttributeAttr getDataAttr(mlir::Operation *op);
/// Returns true if the operation has a data attribute with the given value.
bool hasDataAttr(mlir::Operation *op, cuf::DataAttribute value);
+unsigned getAllocatorIdx(cuf::DataAttribute dataAttr);
+
} // namespace cuf
#endif // FORTRAN_OPTIMIZER_DIALECT_CUF_CUFATTR_H
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 6c97afa9e10e8..43b45cff9a1f5 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -17,31 +17,33 @@ namespace Fortran::runtime::cuda {
extern "C" {
/// Perform allocation of the descriptor.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t *stream = nullptr,
- bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, int32_t allocIdx,
+ int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary.
-int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr,
- bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int32_t allocIdx,
+ int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
/// Perform deallocation of the descriptor with synchronization of it when
/// necessary.
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index bdfc3268e0814..64698370534ce 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -17,31 +17,33 @@ namespace Fortran::runtime::cuda {
extern "C" {
/// Perform allocation of the descriptor.
-int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t *stream = nullptr,
- bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFPointerAllocate)(Descriptor &, int32_t allocIdx,
+ int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary.
-int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
- bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFPointerAllocateSync)(Descriptor &, int32_t allocIdx,
+ int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
int sourceLine = 0);
/// Perform allocation of the descriptor without synchronization. Assign data
/// from source.
int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
- const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
- const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
- bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+ bool *pinned = nullptr, bool hasStat = false,
+ const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+ int sourceLine = 0);
} // extern "C"
diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index 1293d2c5bd3ae..5bb0a11e4fa56 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -165,3 +165,20 @@ bool Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
return true;
return false;
}
+
+unsigned
+Fortran::lower::getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
+ std::optional<Fortran::common::CUDADataAttr> cudaAttr =
+ Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
+ if (cudaAttr) {
+ if (*cudaAttr == Fortran::common::CUDADataAttr::Pinned)
+ return kPinnedAllocatorPos;
+ if (*cudaAttr == Fortran::common::CUDADataAttr::Device)
+ return kDeviceAllocatorPos;
+ if (*cudaAttr == Fortran::common::CUDADataAttr::Managed)
+ return kManagedAllocatorPos;
+ if (*cudaAttr == Fortran::common::CUDADataAttr::Unified)
+ return kUnifiedAllocatorPos;
+ }
+ return kDefaultAllocator;
+}
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 80af7f4c1aaad..6e9518a0f3349 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -478,20 +478,6 @@ createGlobalInitialization(fir::FirOpBuilder &builder, fir::GlobalOp global,
builder.restoreInsertionPoint(insertPt);
}
-static unsigned getAllocatorIdxFromDataAttr(cuf::DataAttributeAttr dataAttr) {
- if (dataAttr) {
- if (dataAttr.getValue() == cuf::DataAttribute::Pinned)
- return kPinnedAllocatorPos;
- if (dataAttr.getValue() == cuf::DataAttribute::Device)
- return kDeviceAllocatorPos;
- if (dataAttr.getValue() == cuf::DataAttribute::Managed)
- return kManagedAllocatorPos;
- if (dataAttr.getValue() == cuf::DataAttribute::Unified)
- return kUnifiedAllocatorPos;
- }
- return kDefaultAllocator;
-}
-
/// Create the global op and its init if it has one
fir::GlobalOp Fortran::lower::defineGlobal(
Fortran::lower::AbstractConverter &converter,
@@ -554,7 +540,9 @@ fir::GlobalOp Fortran::lower::defineGlobal(
mlir::Value box = fir::factory::createUnallocatedBox(
b, loc, symTy,
/*nonDeferredParams=*/{},
- /*typeSourceBox=*/{}, getAllocatorIdxFromDataAttr(dataAttr));
+ /*typeSourceBox=*/{},
+ dataAttr ? cuf::getAllocatorIdx(dataAttr.getValue())
+ : kDefaultAllocator);
fir::HasValueOp::create(b, loc, box);
});
}
diff --git a/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp b/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
index bd0499f406c18..fd5dd555c04cd 100644
--- a/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
@@ -12,6 +12,7 @@
#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
#include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
+#include "flang/Runtime/allocator-registry-consts.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/DialectImplementation.h"
@@ -52,4 +53,16 @@ bool hasDataAttr(mlir::Operation *op, cuf::DataAttribute value) {
return false;
}
+unsigned getAllocatorIdx(cuf::DataAttribute dataAttr) {
+ if (dataAttr == cuf::DataAttribute::Pinned)
+ return kPinnedAllocatorPos;
+ if (dataAttr == cuf::DataAttribute::Device)
+ return kDeviceAllocatorPos;
+ if (dataAttr == cuf::DataAttribute::Managed)
+ return kManagedAllocatorPos;
+ if (dataAttr == cuf::DataAttribute::Unified)
+ return kUnifiedAllocatorPos;
+ return kDefaultAllocator;
+}
+
} // namespace cuf
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 9834b0499b930..9021c5d982321 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -106,7 +106,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
mlir::Value sourceLine;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
sourceLine = fir::factory::locationToLineNo(
- builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6));
+ builder, loc, op.getSource() ? fTy.getInput(8) : fTy.getInput(7));
else
sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
@@ -122,6 +122,8 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
}
llvm::SmallVector<mlir::Value> args;
if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
+ mlir::Value allocIdx = builder.createIntegerConstant(
+ loc, builder.getI32Type(), cuf::getAllocatorIdx(op.getDataAttr()));
mlir::Value pinned =
op.getPinned()
? op.getPinned()
@@ -133,15 +135,15 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
op.getStream() ? op.getStream()
: builder.createNullConstant(loc, fTy.getInput(2));
args = fir::runtime::createArguments(
- builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
- hasStat, errmsg, sourceFile, sourceLine);
+ builder, loc, fTy, op.getBox(), op.getSource(), allocIdx, stream,
+ pinned, hasStat, errmsg, sourceFile, sourceLine);
} else {
mlir::Value stream =
op.getStream() ? op.getStream()
: builder.createNullConstant(loc, fTy.getInput(1));
args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
- stream, pinned, hasStat, errmsg,
- sourceFile, sourceLine);
+ allocIdx, stream, pinned, hasStat,
+ errmsg, sourceFile, sourceLine);
}
} else {
args =
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index ea7890c9aac52..799d9991dfa83 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
// CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
// CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %c2{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
// CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %c2{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
}
// CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
// CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
// CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
// CHECK: _FortranAAllocatableSetBounds
// CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %c2{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %c2{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -177,7 +177,7 @@ func.func @_QQallocate_stream() {
// CHECK-LABEL: func.func @_QQallocate_stream()
// CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
// CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %c2{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
func.func @_QPp_alloc() {
@@ -266,6 +266,6 @@ func.func @_QQpinned() attributes {fir.bindc_name = "testasync"} {
// CHECK: %[[PINNED:.*]] = fir.alloca !fir.logical<4> {bindc_name = "pinnedflag", uniq_name = "_QFEpinnedflag"}
// CHECK: %[[DECL_PINNED:.*]] = fir.declare %[[PINNED]] {uniq_name = "_QFEpinnedflag"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>>
// CHECK: %[[CONV_PINNED:.*]] = fir.convert %[[DECL_PINNED]] : (!fir.ref<!fir.logical<4>>) -> !fir.ref<i1>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %{{.*}}, %[[CONV_PINNED]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %c1{{.*}}, %{{.*}}, %[[CONV_PINNED]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
} // end of module
From d6a33bc1d5a8af27b56dc4fcaeaab6a3b7fc69c3 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 5 Sep 2025 15:14:17 -0700
Subject: [PATCH 2/4] clang-format
---
flang-rt/lib/cuda/allocatable.cpp | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index 483b54061036d..4862d7feb57c2 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -59,8 +59,9 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int32_t allocIdx,
}
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
- const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
- const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+ const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+ bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+ int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocate)(alloc, allocIdx, stream, pinned,
hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
@@ -72,8 +73,9 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
}
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
- const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
- const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+ const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+ bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+ int sourceLine) {
int stat{RTNAME(CUFAllocatableAllocateSync)(alloc, allocIdx, stream, pinned,
hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
From 290e0dcca004ba2d65cc0c8a3a6075c54fb3bee2 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Sat, 6 Sep 2025 07:47:20 -0700
Subject: [PATCH 3/4] Restore flang-rt/lib/runtime/CMakeLists.txt
---
flang-rt/lib/runtime/CMakeLists.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index bd4eca52d6e29..6548ec955b2b8 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -180,7 +180,7 @@ if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
# findloc.cpp has some issues with higher compute capability. Remove it
# from CUDA build until we can lower its memory footprint.
-
+ list(REMOVE_ITEM supported_sources findloc.cpp)
set(sources ${supported_sources})
else ()
set(sources ${supported_sources} ${host_sources} ${f128_sources})
From 4b17886890f3891ad1e86ac344327666421a470d Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Mon, 8 Sep 2025 09:11:44 -0700
Subject: [PATCH 4/4] Only update allocator index when it is the default
---
flang-rt/lib/cuda/allocatable.cpp | 4 +++-
flang-rt/lib/cuda/pointer.cpp | 4 +++-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index 4862d7feb57c2..60a755aeb34eb 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -45,7 +45,9 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int32_t allocIdx,
int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
#if !defined(RT_DEVICE_COMPILATION)
- desc.SetAllocIdx(allocIdx);
+ if (desc.GetAllocIdx() == kDefaultAllocator) {
+ desc.SetAllocIdx(allocIdx);
+ }
#endif
// Perform the standard allocation.
int stat{RTNAME(AllocatableAllocate)(
diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp
index 3e450596e0f12..03187f788638d 100644
--- a/flang-rt/lib/cuda/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@@ -26,7 +26,9 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int32_t allocIdx,
int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
const char *sourceFile, int sourceLine) {
#if !defined(RT_DEVICE_COMPILATION)
- desc.SetAllocIdx(allocIdx);
+ if (desc.GetAllocIdx() == kDefaultAllocator) {
+ desc.SetAllocIdx(allocIdx);
+ }
#endif
// Perform the standard allocation.
int stat{
More information about the llvm-commits mailing list