[flang] [llvm] [flang][cuda] Pass allocator index to allocate functions (PR #157189)

Fri Sep 5 15:12:20 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

<details>
<summary>Changes</summary>

Pass the allocator index as part of the allocate function. The information is part of cuf.allocate and it is useful for device resident components. 

---

Patch is 26.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157189.diff


12 Files Affected:

- (modified) flang-rt/lib/cuda/allocatable.cpp (+14-11) 
- (modified) flang-rt/lib/cuda/pointer.cpp (+19-14) 
- (modified) flang-rt/lib/runtime/CMakeLists.txt (+1-1) 
- (modified) flang/include/flang/Lower/CUDA.h (+1-15) 
- (modified) flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h (+2) 
- (modified) flang/include/flang/Runtime/CUDA/allocatable.h (+12-10) 
- (modified) flang/include/flang/Runtime/CUDA/pointer.h (+12-10) 
- (modified) flang/lib/Lower/CUDA.cpp (+17) 
- (modified) flang/lib/Lower/ConvertVariable.cpp (+3-15) 
- (modified) flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp (+13) 
- (modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+7-5) 
- (modified) flang/test/Fir/CUDA/cuda-allocate.fir (+7-7) 


``````````diff

diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index ff1a225d66ce9..483b54061036d 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -23,11 +23,11 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
-    bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFAllocatableAllocate)(
-      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+      desc, allocIdx, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -41,9 +41,12 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
   return stat;
 }
 
-int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
-    bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
+#if !defined(RT_DEVICE_COMPILATION)
+  desc.SetAllocIdx(allocIdx);
+#endif
   // Perform the standard allocation.
   int stat{RTNAME(AllocatableAllocate)(
       desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
@@ -56,10 +59,10 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
 }
 
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFAllocatableAllocate)(
-      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(CUFAllocatableAllocate)(alloc, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
@@ -69,10 +72,10 @@ int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
 }
 
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned, bool hasStat,
     const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFAllocatableAllocateSync)(
-      alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(CUFAllocatableAllocateSync)(alloc, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp
index d3f5cfe8e96a1..3e450596e0f12 100644
--- a/flang-rt/lib/cuda/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@@ -22,9 +22,12 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 RT_EXT_API_GROUP_BEGIN
 
-int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
-    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+int RTDEF(CUFPointerAllocate)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
+    const char *sourceFile, int sourceLine) {
+#if !defined(RT_DEVICE_COMPILATION)
+  desc.SetAllocIdx(allocIdx);
+#endif
   // Perform the standard allocation.
   int stat{
       RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
@@ -36,11 +39,11 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
   return stat;
 }
 
-int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
-    bool *pinned, bool hasStat, const Descriptor *errMsg,
+int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int32_t allocIdx,
+    int64_t *stream, bool *pinned, bool hasStat, const Descriptor *errMsg,
     const char *sourceFile, int sourceLine) {
   int stat{RTNAME(CUFPointerAllocate)(
-      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+      desc, allocIdx, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -55,10 +58,11 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
 }
 
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFPointerAllocate)(
-      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
+  int stat{RTNAME(CUFPointerAllocate)(pointer, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
@@ -68,10 +72,11 @@ int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
 }
 
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFPointerAllocateSync)(
-      pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+    const Descriptor &source, int32_t allocIdx, int64_t *stream, bool *pinned,
+    bool hasStat, const Descriptor *errMsg, const char *sourceFile,
+    int sourceLine) {
+  int stat{RTNAME(CUFPointerAllocateSync)(pointer, allocIdx, stream, pinned,
+      hasStat, errMsg, sourceFile, sourceLine)};
   if (stat == StatOk) {
     Terminator terminator{sourceFile, sourceLine};
     Fortran::runtime::DoFromSourceAssign(
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 6548ec955b2b8..bd4eca52d6e29 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -180,7 +180,7 @@ if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
 elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
   # findloc.cpp has some issues with higher compute capability. Remove it
   # from CUDA build until we can lower its memory footprint.
-  list(REMOVE_ITEM supported_sources findloc.cpp)
+  
   set(sources ${supported_sources})
 else ()
   set(sources ${supported_sources} ${host_sources} ${f128_sources})
diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index 4a831fd502af4..0a085f47327f2 100644
--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -31,21 +31,7 @@ namespace Fortran::lower {
 
 class AbstractConverter;
 
-static inline unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
-  std::optional<Fortran::common::CUDADataAttr> cudaAttr =
-      Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
-  if (cudaAttr) {
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Pinned)
-      return kPinnedAllocatorPos;
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Device)
-      return kDeviceAllocatorPos;
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Managed)
-      return kManagedAllocatorPos;
-    if (*cudaAttr == Fortran::common::CUDADataAttr::Unified)
-      return kUnifiedAllocatorPos;
-  }
-  return kDefaultAllocator;
-}
+unsigned getAllocatorIdx(const Fortran::semantics::Symbol &sym);
 
 void initializeDeviceComponentAllocator(
     Fortran::lower::AbstractConverter &converter,
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
index 4a250d1cc6c54..c00f9e718ad18 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
+++ b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
@@ -112,6 +112,8 @@ cuf::DataAttributeAttr getDataAttr(mlir::Operation *op);
 /// Returns true if the operation has a data attribute with the given value.
 bool hasDataAttr(mlir::Operation *op, cuf::DataAttribute value);
 
+unsigned getAllocatorIdx(cuf::DataAttribute dataAttr);
+
 } // namespace cuf
 
 #endif // FORTRAN_OPTIMIZER_DIALECT_CUF_CUFATTR_H
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 6c97afa9e10e8..43b45cff9a1f5 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -17,31 +17,33 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 
 /// Perform allocation of the descriptor.
-int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFAllocatableAllocate)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform deallocation of the descriptor with synchronization of it when
 /// necessary.
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index bdfc3268e0814..64698370534ce 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -17,31 +17,33 @@ namespace Fortran::runtime::cuda {
 extern "C" {
 
 /// Perform allocation of the descriptor.
-int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFPointerAllocate)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
-int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
-    bool *pinned = nullptr, bool hasStat = false,
+int RTDECL(CUFPointerAllocateSync)(Descriptor &, int32_t allocIdx,
+    int64_t *stream = nullptr, bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
     int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
 int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary. Assign data from source.
 int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
-    const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
-    bool hasStat = false, const Descriptor *errMsg = nullptr,
-    const char *sourceFile = nullptr, int sourceLine = 0);
+    const Descriptor &source, int32_t allocIdx, int64_t *stream = nullptr,
+    bool *pinned = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0);
 
 } // extern "C"
 
diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index 1293d2c5bd3ae..5bb0a11e4fa56 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -165,3 +165,20 @@ bool Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
       return true;
   return false;
 }
+
+unsigned
+Fortran::lower::getAllocatorIdx(const Fortran::semantics::Symbol &sym) {
+  std::optional<Fortran::common::CUDADataAttr> cudaAttr =
+      Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
+  if (cudaAttr) {
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Pinned)
+      return kPinnedAllocatorPos;
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Device)
+      return kDeviceAllocatorPos;
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Managed)
+      return kManagedAllocatorPos;
+    if (*cudaAttr == Fortran::common::CUDADataAttr::Unified)
+      return kUnifiedAllocatorPos;
+  }
+  return kDefaultAllocator;
+}
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 80af7f4c1aaad..6e9518a0f3349 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -478,20 +478,6 @@ createGlobalInitialization(fir::FirOpBuilder &builder, fir::GlobalOp global,
   builder.restoreInsertionPoint(insertPt);
 }
 
-static unsigned getAllocatorIdxFromDataAttr(cuf::DataAttributeAttr dataAttr) {
-  if (dataAttr) {
-    if (dataAttr.getValue() == cuf::DataAttribute::Pinned)
-      return kPinnedAllocatorPos;
-    if (dataAttr.getValue() == cuf::DataAttribute::Device)
-      return kDeviceAllocatorPos;
-    if (dataAttr.getValue() == cuf::DataAttribute::Managed)
-      return kManagedAllocatorPos;
-    if (dataAttr.getValue() == cuf::DataAttribute::Unified)
-      return kUnifiedAllocatorPos;
-  }
-  return kDefaultAllocator;
-}
-
 /// Create the global op and its init if it has one
 fir::GlobalOp Fortran::lower::defineGlobal(
     Fortran::lower::AbstractConverter &converter,
@@ -554,7 +540,9 @@ fir::GlobalOp Fortran::lower::defineGlobal(
         mlir::Value box = fir::factory::createUnallocatedBox(
             b, loc, symTy,
             /*nonDeferredParams=*/{},
-            /*typeSourceBox=*/{}, getAllocatorIdxFromDataAttr(dataAttr));
+            /*typeSourceBox=*/{},
+            dataAttr ? cuf::getAllocatorIdx(dataAttr.getValue())
+                     : kDefaultAllocator);
         fir::HasValueOp::create(b, loc, box);
       });
     }
diff --git a/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp b/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
index bd0499f406c18..fd5dd555c04cd 100644
--- a/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
@@ -12,6 +12,7 @@
 
 #include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
 #include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
+#include "flang/Runtime/allocator-registry-consts.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/DialectImplementation.h"
@@ -52,4 +53,16 @@ bool hasDataAttr(mlir::Operation *op, cuf::DataAttribute value) {
   return false;
 }
 
+unsigned getAllocatorIdx(cuf::DataAttribute dataAttr) {
+  if (dataAttr == cuf::DataAttribute::Pinned)
+    return kPinnedAllocatorPos;
+  if (dataAttr == cuf::DataAttribute::Device)
+    return kDeviceAllocatorPos;
+  if (dataAttr == cuf::DataAttribute::Managed)
+    return kManagedAllocatorPos;
+  if (dataAttr == cuf::DataAttribute::Unified)
+    return kUnifiedAllocatorPos;
+  return kDefaultAllocator;
+}
+
 } // namespace cuf
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 9834b0499b930..9021c5d982321 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -106,7 +106,7 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   mlir::Value sourceLine;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>)
     sourceLine = fir::factory::locationToLineNo(
-        builder, loc, op.getSource() ? fTy.getInput(7) : fTy.getInput(6));
+        builder, loc, op.getSource() ? fTy.getInput(8) : fTy.getInput(7));
   else
     sourceLine = fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
 
@@ -122,6 +122,8 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
   }
   llvm::SmallVector<mlir::Value> args;
   if constexpr (std::is_same_v<OpTy, cuf::AllocateOp>) {
+    mlir::Value allocIdx = builder.createIntegerConstant(
+        loc, builder.getI32Type(), cuf::getAllocatorIdx(op.getDataAttr()));
     mlir::Value pinned =
         op.getPinned()
             ? op.getPinned()
@@ -133,15 +135,15 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
           op.getStream() ? op.getStream()
                          : builder.createNullConstant(loc, fTy.getInput(2));
       args = fir::runtime::createArguments(
-          builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
-          hasStat, errmsg, sourceFile, sourceLine);
+          builder, loc, fTy, op.getBox(), op.getSource(), allocIdx, stream,
+          pinned, hasStat, errmsg, sourceFile, sourceLine);
     } else {
       mlir::Value stream =
           op.getStream() ? op.getStream()
                          : builder.createNullConstant(loc, fTy.getInput(1));
       args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
-                                           stream, pinned, hasStat, errmsg,
-                                           sourceFile, sourceLine);
+                                           allocIdx, stream, pinned, hasStat,
+                                           errmsg, sourceFile, sourceLine);
     }
   } else {
     args =
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index ea7890c9aac52..799d9991dfa83 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
 // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{....
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/157189