[flang] [llvm] [flang][cuda] Pass allocator index to allocate functions (PR #157189)

Mon Sep 8 09:11:58 PDT 2025

https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/157189

>From 163d9dc781f331d8f06b097470fbbaf46f61c897 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 5 Sep 2025 14:44:00 -0700
Subject: [PATCH 1/4] [flang][cuda] Pass allocator index to allocate functions

---
 flang-rt/lib/cuda/allocatable.cpp             | 25 +++++++-------
 flang-rt/lib/cuda/pointer.cpp                 | 33 +++++++++++--------
 flang-rt/lib/runtime/CMakeLists.txt           |  2 +-
 flang/include/flang/Lower/CUDA.h              | 16 +--------
 .../Dialect/CUF/Attributes/CUFAttr.h          |  2 ++
 .../include/flang/Runtime/CUDA/allocatable.h  | 22 +++++++------
 flang/include/flang/Runtime/CUDA/pointer.h    | 22 +++++++------
 flang/lib/Lower/CUDA.cpp                      | 17 ++++++++++
 flang/lib/Lower/ConvertVariable.cpp           | 18 ++--------
 .../Dialect/CUF/Attributes/CUFAttr.cpp        | 13 ++++++++
 .../Optimizer/Transforms/CUFOpConversion.cpp  | 12 ++++---
 flang/test/Fir/CUDA/cuda-allocate.fir         | 14 ++++----
 12 files changed, 108 insertions(+), 88 deletions(-)

diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index ff1a225d66ce9..483b54061036d 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -23,11 +23,11 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
-    bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(
-      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+      desc, allocIdx, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -41,9 +41,12 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
   return stat;
 }
 
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
-    bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
+#if !defined(RT_DEVICE_COMPILATION)
+  desc.SetAllocIdx(allocIdx);
+#endif
   // Perform the standard allocation.
   int stat{RTNAME(AllocatableAllocate)(
       desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
@@ -56,10 +59,10 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
 }
 
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFAllocatableAllocate)(
-      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(CUFAllocatableAllocate)(alloc, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
@@ -69,10 +72,10 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }
 
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFAllocatableAllocateSync)(
-      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(CUFAllocatableAllocateSync)(alloc, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp
index d3f5cfe8e96a1..3e450596e0f12 100644
--- a/flang-rt/lib/cuda/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@@ -22,9 +22,12 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+int RTDEF(CUFPointerAllocate)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
+#if !defined(RT_DEVICE_COMPILATION)
+  desc.SetAllocIdx(allocIdx);
+#endif
   // Perform the standard allocation.
   int stat{
       RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
@@ -36,11 +39,11 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
   return stat;
 }
 
-int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
-    bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFPointerAllocate)(
-      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+      desc, allocIdx, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -55,10 +58,11 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
 }
 
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFPointerAllocate)(
-      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
+  int stat{RTNAME(CUFPointerAllocate)(pointer, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
@@ -68,10 +72,11 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
 }
 
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFPointerAllocateSync)(
-      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
+  int stat{RTNAME(CUFPointerAllocateSync)(pointer, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 6548ec955b2b8..bd4eca52d6e29 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -180,7 +180,7 @@ if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
 elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
   # findloc.cpp has some issues with higher compute capability. Remove it
   # from CUDA build until we can lower its memory footprint.
-  list(REMOVE_ITEM supported_sources findloc.cpp)
+  
   set(sources ${supported_sources})
 else ()
   set(sources ${supported_sources} ${host_sources} ${f128_sources})
diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index 4a831fd502af4..0a085f47327f2 100644
--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -31,21 +31,7 @@ namespace Fortran::lower {
 
 class AbstractConverter;
 
-static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
-  std::optional<Fortran::common::CUDADataAttr> cudaAttr =
-      Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
-  if (cudaAttr) {
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Pinned)
-      return kPinnedAllocatorPos;
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Device)
-      return kDeviceAllocatorPos;
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Managed)
-      return kManagedAllocatorPos;
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Unified)
-      return kUnifiedAllocatorPos;
-  }
-  return kDefaultAllocator;
-}
+unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym);
 
 void initializeDeviceComponentAllocator(
     Fortran::lower::AbstractConverter &converter,
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
index 4a250d1cc6c54..c00f9e718ad18 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
+++ b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
@@ -112,6 +112,8 @@ cuf::DataAttributeAttr getDataAttr(mlir::Operation *op);
 /// Returns true if the operation has a data attribute with the given value.
 bool hasDataAttr(mlir::Operation *op, cuf::DataAttribute value);
 
+unsigned getAllocatorIdx(cuf::DataAttribute dataAttr);
+
 } // namespace cuf
 
 #endif // FORTRAN_OPTIMIZER_DIALECT_CUF_CUFATTR_H
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 6c97afa9e10e8..43b45cff9a1f5 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -17,31 +17,33 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 
 /// Perform allocation of the descriptor.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform deallocation of the descriptor with synchronization of it when
 /// necessary.
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index bdfc3268e0814..64698370534ce 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -17,31 +17,33 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 
 /// Perform allocation of the descriptor.
-int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFPointerAllocate)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFPointerAllocateSync)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 } // extern "C"
 
diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index 1293d2c5bd3ae..5bb0a11e4fa56 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -165,3 +165,20 @@ bool Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
       return true;
   return false;
 }
+
+unsigned
+Fortran::lower::getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
+  std::optional<Fortran::common::CUDADataAttr> cudaAttr =
+      Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
+  if (cudaAttr) {
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Pinned)
+      return kPinnedAllocatorPos;
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Device)
+      return kDeviceAllocatorPos;
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Managed)
+      return kManagedAllocatorPos;
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Unified)
+      return kUnifiedAllocatorPos;
+  }
+  return kDefaultAllocator;
+}
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 80af7f4c1aaad..6e9518a0f3349 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -478,20 +478,6 @@ createGlobalInitialization(fir::FirOpBuilder &builder, fir::GlobalOp global,
   builder.restoreInsertionPoint(insertPt);
 }
 
-static unsigned getAllocatorIdxFromDataAttr(cuf::DataAttributeAttr dataAttr) {
-  if (dataAttr) {
-    if (dataAttr.getValue() == cuf::DataAttribute::Pinned)
-      return kPinnedAllocatorPos;
-    if (dataAttr.getValue() == cuf::DataAttribute::Device)
-      return kDeviceAllocatorPos;
-    if (dataAttr.getValue() == cuf::DataAttribute::Managed)
-      return kManagedAllocatorPos;
-    if (dataAttr.getValue() == cuf::DataAttribute::Unified)
-      return kUnifiedAllocatorPos;
-  }
-  return kDefaultAllocator;
-}
-
 /// Create the global op and its init if it has one
 fir::GlobalOp Fortran::lower::defineGlobal(
     Fortran::lower::AbstractConverter &converter,
@@ -554,7 +540,9 @@ fir::GlobalOp Fortran::lower::defineGlobal(
         mlir::Value box = fir::factory::createUnallocatedBox(
             b, loc, symTy,
             /*nonDeferredParams=*/{},
-            /*typeSourceBox=*/{}, getAllocatorIdxFromDataAttr(dataAttr));
+            /*typeSourceBox=*/{},
+            dataAttr ? cuf::getAllocatorIdx(dataAttr.getValue())
+                     : kDefaultAllocator);
         fir::HasValueOp::create(b, loc, box);
       });
     }
diff --git a/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp b/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
index bd0499f406c18..fd5dd555c04cd 100644
--- a/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
@@ -12,6 +12,7 @@
 
 #include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
 #include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
+#include "flang/Runtime/allocator-registry-consts.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/DialectImplementation.h"
@@ -52,4 +53,16 @@ bool hasDataAttr(mlir::Operation *op, cuf::DataAttribute value) {
   return false;
 }
 
+unsigned getAllocatorIdx(cuf::DataAttribute dataAttr) {
+  if (dataAttr == cuf::DataAttribute::Pinned)
+    return kPinnedAllocatorPos;
+  if (dataAttr == cuf::DataAttribute::Device)
+    return kDeviceAllocatorPos;
+  if (dataAttr == cuf::DataAttribute::Managed)
+    return kManagedAllocatorPos;
+  if (dataAttr == cuf::DataAttribute::Unified)
+    return kUnifiedAllocatorPos;
+  return kDefaultAllocator;
+}
+
 } // namespace cuf
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 9834b0499b930..9021c5d982321 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -106,7 +106,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   mlir::Value sourceLine;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
     sourceLine = fir::factory::locationToLineNo(
-        builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6));
+        builder, loc, op.getSource() ? fTy.getInput(8) : fTy.getInput(7));
   else
     sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
 
@@ -122,6 +122,8 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   }
   llvm::SmallVector<mlir::Value> args;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
+    mlir::Value allocIdx = builder.createIntegerConstant(
+        loc, builder.getI32Type(), cuf::getAllocatorIdx(op.getDataAttr()));
     mlir::Value pinned =
         op.getPinned()
             ? op.getPinned()
@@ -133,15 +135,15 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
           op.getStream() ? op.getStream()
                          : builder.createNullConstant(loc, fTy.getInput(2));
       args = fir::runtime::createArguments(
-          builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
-          hasStat, errmsg, sourceFile, sourceLine);
+          builder, loc, fTy, op.getBox(), op.getSource(), allocIdx, stream,
+          pinned, hasStat, errmsg, sourceFile, sourceLine);
     } else {
       mlir::Value stream =
           op.getStream() ? op.getStream()
                          : builder.createNullConstant(loc, fTy.getInput(1));
       args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
-                                           stream, pinned, hasStat, errmsg,
-                                           sourceFile, sourceLine);
+                                           allocIdx, stream, pinned, hasStat,
+                                           errmsg, sourceFile, sourceLine);
     }
   } else {
     args =
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index ea7890c9aac52..799d9991dfa83 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
 // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %c2{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
 // CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %c2{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
 }
 
 // CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
 // CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 // CHECK: _FortranAAllocatableSetBounds
 // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %c2{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 func.func @_QPallocate_source() {
@@ -142,7 +142,7 @@ func.func @_QPallocate_source() {
 // CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
 // CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %c2{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
@@ -177,7 +177,7 @@ func.func @_QQallocate_stream() {
 // CHECK-LABEL: func.func @_QQallocate_stream() 
 // CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
 // CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %c2{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
 func.func @_QPp_alloc() {
@@ -266,6 +266,6 @@ func.func @_QQpinned() attributes {fir.bindc_name = "testasync"} {
 // CHECK: %[[PINNED:.*]] = fir.alloca !fir.logical<4> {bindc_name = "pinnedflag", uniq_name = "_QFEpinnedflag"}
 // CHECK: %[[DECL_PINNED:.*]] = fir.declare %[[PINNED]] {uniq_name = "_QFEpinnedflag"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>>
 // CHECK: %[[CONV_PINNED:.*]] = fir.convert %[[DECL_PINNED]] : (!fir.ref<!fir.logical<4>>) -> !fir.ref<i1>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %{{.*}}, %[[CONV_PINNED]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %c1{{.*}}, %{{.*}}, %[[CONV_PINNED]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i32, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 } // end of module

>From d6a33bc1d5a8af27b56dc4fcaeaab6a3b7fc69c3 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Fri, 5 Sep 2025 15:14:17 -0700
Subject: [PATCH 2/4] clang-format

---
 flang-rt/lib/cuda/allocatable.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index 483b54061036d..4862d7feb57c2 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -59,8 +59,9 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int32_t allocIdx,
 }
 
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(alloc, allocIdx, stream, pinned,
       hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
@@ -72,8 +73,9 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }
 
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocateSync)(alloc, allocIdx, stream, pinned,
       hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {

>From 290e0dcca004ba2d65cc0c8a3a6075c54fb3bee2 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Sat, 6 Sep 2025 07:47:20 -0700
Subject: [PATCH 3/4] Restore flang-rt/lib/runtime/CMakeLists.txt

---
 flang-rt/lib/runtime/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index bd4eca52d6e29..6548ec955b2b8 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -180,7 +180,7 @@ if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
 elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
   # findloc.cpp has some issues with higher compute capability. Remove it
   # from CUDA build until we can lower its memory footprint.
-  
+  list(REMOVE_ITEM supported_sources findloc.cpp)
   set(sources ${supported_sources})
 else ()
   set(sources ${supported_sources} ${host_sources} ${f128_sources})

>From 4b17886890f3891ad1e86ac344327666421a470d Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Mon, 8 Sep 2025 09:11:44 -0700
Subject: [PATCH 4/4] Only update allocator index when its default

---
 flang-rt/lib/cuda/allocatable.cpp | 4 +++-
 flang-rt/lib/cuda/pointer.cpp     | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index 4862d7feb57c2..60a755aeb34eb 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -45,7 +45,9 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int32_t allocIdx,
     int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
 #if !defined(RT_DEVICE_COMPILATION)
-  desc.SetAllocIdx(allocIdx);
+  if (desc.GetAllocIdx() == kDefaultAllocator) {
+    desc.SetAllocIdx(allocIdx);
+  }
 #endif
   // Perform the standard allocation.
   int stat{RTNAME(AllocatableAllocate)(
diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp
index 3e450596e0f12..03187f788638d 100644
--- a/flang-rt/lib/cuda/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@@ -26,7 +26,9 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int32_t allocIdx,
     int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
 #if !defined(RT_DEVICE_COMPILATION)
-  desc.SetAllocIdx(allocIdx);
+  if (desc.GetAllocIdx() == kDefaultAllocator) {
+    desc.SetAllocIdx(allocIdx);
+  }
 #endif
   // Perform the standard allocation.
   int stat{