[flang] [llvm] Reland "[flang][cuda] Add support for derived-type initialization on device #172568" (PR #172913)

Thu Dec 18 14:01:43 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

<details>
<summary>Changes</summary>

#172568

---

Patch is 84.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/172913.diff


32 Files Affected:

- (modified) flang-rt/include/flang-rt/runtime/derived.h (+9-1) 
- (modified) flang-rt/include/flang-rt/runtime/work-queue.h (+17-7) 
- (modified) flang-rt/lib/cuda/allocatable.cpp (+6-6) 
- (modified) flang-rt/lib/cuda/memmove-function.cpp (+18) 
- (modified) flang-rt/lib/cuda/pointer.cpp (+6-6) 
- (modified) flang-rt/lib/runtime/allocatable.cpp (+3-3) 
- (modified) flang-rt/lib/runtime/derived.cpp (+24-6) 
- (modified) flang-rt/lib/runtime/pointer.cpp (+4-2) 
- (modified) flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h (+27) 
- (modified) flang/include/flang/Runtime/CUDA/allocatable.h (+2-2) 
- (modified) flang/include/flang/Runtime/CUDA/memmove-function.h (+6) 
- (modified) flang/include/flang/Runtime/CUDA/pointer.h (+2-2) 
- (modified) flang/include/flang/Runtime/allocatable.h (+9-1) 
- (modified) flang/include/flang/Runtime/freestanding-tools.h (+7-1) 
- (modified) flang/include/flang/Runtime/pointer.h (+8-1) 
- (modified) flang/lib/Lower/Allocatable.cpp (+1-9) 
- (modified) flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp (+4-3) 
- (modified) flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp (+6-1) 
- (modified) flang/test/Fir/CUDA/cuda-allocate.fir (+6-6) 
- (removed) flang/test/Lower/CUDA/TODO/cuda-allocate-default-init.cuf (-15) 
- (modified) flang/test/Lower/Intrinsics/c_loc.f90 (+1-1) 
- (modified) flang/test/Lower/OpenACC/acc-declare.f90 (+2-2) 
- (modified) flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 (+1-1) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 (+1-1) 
- (modified) flang/test/Lower/allocatable-polymorphic.f90 (+17-17) 
- (modified) flang/test/Lower/allocatable-runtime.f90 (+2-2) 
- (modified) flang/test/Lower/allocate-mold.f90 (+2-2) 
- (modified) flang/test/Lower/assign-statement.f90 (+1-1) 
- (modified) flang/test/Lower/nullify-polymorphic.f90 (+1-1) 
- (modified) flang/test/Lower/polymorphic.f90 (+1-1) 
- (modified) flang/test/Lower/volatile-allocatable.f90 (+9-9) 
- (modified) flang/test/Transforms/lower-repack-arrays.fir (+4-4) 


``````````diff

diff --git a/flang-rt/include/flang-rt/runtime/derived.h b/flang-rt/include/flang-rt/runtime/derived.h
index ac6962c57168c..20d022df56170 100644
--- a/flang-rt/include/flang-rt/runtime/derived.h
+++ b/flang-rt/include/flang-rt/runtime/derived.h
@@ -12,6 +12,7 @@
 #define FLANG_RT_RUNTIME_DERIVED_H_
 
 #include "flang/Common/api-attrs.h"
+#include "flang/Runtime/freestanding-tools.h"
 
 namespace Fortran::runtime::typeInfo {
 class DerivedType;
@@ -23,8 +24,15 @@ class Terminator;
 
 // Perform default component initialization, allocate automatic components.
 // Returns a STAT= code (0 when all's well).
+#ifdef RT_DEVICE_COMPILATION
 RT_API_ATTRS int Initialize(const Descriptor &, const typeInfo::DerivedType &,
-    Terminator &, bool hasStat = false, const Descriptor *errMsg = nullptr);
+    Terminator &, bool hasStat = false, const Descriptor *errMsg = nullptr,
+    MemcpyFct memcpyFct = &MemcpyWrapper);
+#else
+RT_API_ATTRS int Initialize(const Descriptor &, const typeInfo::DerivedType &,
+    Terminator &, bool hasStat = false, const Descriptor *errMsg = nullptr,
+    MemcpyFct memcpyFct = &Fortran::runtime::memcpy);
+#endif
 
 // Initializes an object clone from the original object.
 // Each allocatable member of the clone is allocated with the same bounds as
diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h
index 7d7f8ad991a57..54a7457741356 100644
--- a/flang-rt/include/flang-rt/runtime/work-queue.h
+++ b/flang-rt/include/flang-rt/runtime/work-queue.h
@@ -249,12 +249,15 @@ class ElementsOverComponents : public Elementwise, public Componentwise {
 class InitializeTicket : public ImmediateTicketRunner<InitializeTicket>,
                          private ElementsOverComponents {
 public:
-  RT_API_ATTRS InitializeTicket(
-      const Descriptor &instance, const typeInfo::DerivedType &derived)
+  RT_API_ATTRS InitializeTicket(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, MemcpyFct memcpyFct)
       : ImmediateTicketRunner<InitializeTicket>{*this},
-        ElementsOverComponents{instance, derived} {}
+        ElementsOverComponents{instance, derived}, memcpyFct_{memcpyFct} {}
   RT_API_ATTRS int Begin(WorkQueue &);
   RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  MemcpyFct memcpyFct_;
 };
 
 // Initializes one derived type instance from the value of another
@@ -448,12 +451,19 @@ class WorkQueue {
 
   // APIs for particular tasks.  These can return StatOk if the work is
   // completed immediately.
-  RT_API_ATTRS int BeginInitialize(
-      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
+#ifdef RT_DEVICE_COMPILATION
+  RT_API_ATTRS int BeginInitialize(const Descriptor &descriptor,
+      const typeInfo::DerivedType &derived,
+      MemcpyFct memcpyFct = &MemcpyWrapper) {
+#else
+  RT_API_ATTRS int BeginInitialize(const Descriptor &descriptor,
+      const typeInfo::DerivedType &derived,
+      MemcpyFct memcpyFct = &Fortran::runtime::memcpy) {
+#endif
     if (runTicketsImmediately_) {
-      return InitializeTicket{descriptor, derived}.Run(*this);
+      return InitializeTicket{descriptor, derived, memcpyFct}.Run(*this);
     } else {
-      StartTicket().u.emplace<InitializeTicket>(descriptor, derived);
+      StartTicket().u.emplace<InitializeTicket>(descriptor, derived, memcpyFct);
       return StatContinue;
     }
   }
diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index 662703dfb6321..0a7828f8016d5 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -25,9 +25,9 @@ RT_EXT_API_GROUP_BEGIN
 
 int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFAllocatableAllocate)(
-      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+    const char *sourceFile, int sourceLine, bool deviceInit) {
+  int stat{RTNAME(CUFAllocatableAllocate)(desc, stream, pinned, hasStat, errMsg,
+      sourceFile, sourceLine, deviceInit)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -43,10 +43,10 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
 
 int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
+    const char *sourceFile, int sourceLine, bool deviceInit) {
   // Perform the standard allocation.
-  int stat{RTNAME(AllocatableAllocate)(
-      desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(AllocatableAllocate)(desc, stream, hasStat, errMsg,
+      sourceFile, sourceLine, deviceInit ? &MemcpyHostToDevice : nullptr)};
   if (pinned) {
     // Set pinned according to stat. More infrastructre is needed to set it
     // closer to the actual allocation call.
diff --git a/flang-rt/lib/cuda/memmove-function.cpp b/flang-rt/lib/cuda/memmove-function.cpp
index a7eb0cf1a3e7a..8ebc1250a6687 100644
--- a/flang-rt/lib/cuda/memmove-function.cpp
+++ b/flang-rt/lib/cuda/memmove-function.cpp
@@ -32,4 +32,22 @@ void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count) {
   return dst;
 }
 
+void *MemcpyHostToDevice(void *dst, const void *src, std::size_t count) {
+  // TODO: Use cudaMemcpyAsync when we have support for stream.
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
+  return dst;
+}
+
+void *MemcpyDeviceToHost(void *dst, const void *src, std::size_t count) {
+  // TODO: Use cudaMemcpyAsync when we have support for stream.
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost));
+  return dst;
+}
+
+void *MemcpyDeviceToDevice(void *dst, const void *src, std::size_t count) {
+  // TODO: Use cudaMemcpyAsync when we have support for stream.
+  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice));
+  return dst;
+}
+
 } // namespace Fortran::runtime::cuda
diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp
index f07b1a9b60924..bc990c5d27e21 100644
--- a/flang-rt/lib/cuda/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@@ -24,10 +24,10 @@ RT_EXT_API_GROUP_BEGIN
 
 int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
     bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+    int sourceLine, bool deviceInit) {
   // Perform the standard allocation.
-  int stat{
-      RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile,
+      sourceLine, deviceInit ? &MemcpyHostToDevice : nullptr)};
   if (pinned) {
     // Set pinned according to stat. More infrastructre is needed to set it
     // closer to the actual allocation call.
@@ -38,9 +38,9 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
 
 int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFPointerAllocate)(
-      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+    const char *sourceFile, int sourceLine, bool deviceInit) {
+  int stat{RTNAME(CUFPointerAllocate)(desc, stream, pinned, hasStat, errMsg,
+      sourceFile, sourceLine, deviceInit)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
diff --git a/flang-rt/lib/runtime/allocatable.cpp b/flang-rt/lib/runtime/allocatable.cpp
index f724f0a20884b..5b3db1e47238b 100644
--- a/flang-rt/lib/runtime/allocatable.cpp
+++ b/flang-rt/lib/runtime/allocatable.cpp
@@ -135,7 +135,7 @@ void RTDEF(AllocatableApplyMold)(
 
 int RTDEF(AllocatableAllocate)(Descriptor &descriptor,
     std::int64_t *asyncObject, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
+    const char *sourceFile, int sourceLine, MemcpyFct memcpyFct) {
   Terminator terminator{sourceFile, sourceLine};
   if (!descriptor.IsAllocatable()) {
     return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
@@ -148,8 +148,8 @@ int RTDEF(AllocatableAllocate)(Descriptor &descriptor,
       if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
         if (const auto *derived{addendum->derivedType()}) {
           if (!derived->noInitializationNeeded()) {
-            stat =
-                Initialize(descriptor, *derived, terminator, hasStat, errMsg);
+            stat = Initialize(
+                descriptor, *derived, terminator, hasStat, errMsg, memcpyFct);
           }
         }
       }
diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index 7e50674631624..7fc426b9efc9a 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -13,6 +13,7 @@
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
 #include "flang-rt/runtime/work-queue.h"
+#include "flang/Runtime/CUDA/memmove-function.h"
 
 namespace Fortran::runtime {
 
@@ -32,9 +33,9 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank],
 
 RT_API_ATTRS int Initialize(const Descriptor &instance,
     const typeInfo::DerivedType &derived, Terminator &terminator, bool,
-    const Descriptor *) {
+    const Descriptor *, MemcpyFct memcpyFct) {
   WorkQueue workQueue{terminator};
-  int status{workQueue.BeginInitialize(instance, derived)};
+  int status{workQueue.BeginInitialize(instance, derived, memcpyFct)};
   return status == StatContinue ? workQueue.Run() : status;
 }
 
@@ -72,7 +73,11 @@ RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
       // Explicit initialization of data pointers and
       // non-allocatable non-automatic components
       std::size_t bytes{component_->SizeInBytes(instance_)};
-      runtime::memcpy(rawComponent, init, bytes);
+      if (memcpyFct_) {
+        memcpyFct_(rawComponent, init, bytes);
+      } else {
+        Fortran::runtime::memcpy(rawComponent, init, bytes);
+      }
     } else if (component_->genre() == typeInfo::Component::Genre::Pointer ||
         component_->genre() == typeInfo::Component::Genre::PointerDevice) {
       // Data pointers without explicit initialization are established
@@ -110,20 +115,33 @@ RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
             chunk = done;
           }
           char *uninitialized{rawInstance + done * *stride};
-          runtime::memcpy(uninitialized, rawInstance, chunk * *stride);
+          if (memcpyFct_) {
+            memcpyFct_(uninitialized, rawInstance, chunk * *stride);
+          } else {
+            Fortran::runtime::memcpy(
+                uninitialized, rawInstance, chunk * *stride);
+          }
           done += chunk;
         }
       } else {
         for (std::size_t done{1}; done < elements_; ++done) {
           char *uninitialized{rawInstance + done * *stride};
-          runtime::memcpy(uninitialized, rawInstance, elementBytes);
+          if (memcpyFct_) {
+            memcpyFct_(uninitialized, rawInstance, elementBytes);
+          } else {
+            Fortran::runtime::memcpy(uninitialized, rawInstance, elementBytes);
+          }
         }
       }
     } else { // one at a time with subscription
       for (Elementwise::Advance(); !Elementwise::IsComplete();
           Elementwise::Advance()) {
         char *element{instance_.Element<char>(subscripts_)};
-        runtime::memcpy(element, rawInstance, elementBytes);
+        if (memcpyFct_) {
+          memcpyFct_(element, rawInstance, elementBytes);
+        } else {
+          Fortran::runtime::memcpy(element, rawInstance, elementBytes);
+        }
       }
     }
   }
diff --git a/flang-rt/lib/runtime/pointer.cpp b/flang-rt/lib/runtime/pointer.cpp
index f8ada65541a1a..0832b5656f1ab 100644
--- a/flang-rt/lib/runtime/pointer.cpp
+++ b/flang-rt/lib/runtime/pointer.cpp
@@ -157,7 +157,8 @@ RT_API_ATTRS void *AllocateValidatedPointerPayload(
 }
 
 int RTDEF(PointerAllocate)(Descriptor &pointer, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine,
+    MemcpyFct memcpyFct) {
   Terminator terminator{sourceFile, sourceLine};
   if (!pointer.IsPointer()) {
     return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
@@ -179,7 +180,8 @@ int RTDEF(PointerAllocate)(Descriptor &pointer, bool hasStat,
   if (const DescriptorAddendum * addendum{pointer.Addendum()}) {
     if (const auto *derived{addendum->derivedType()}) {
       if (!derived->noInitializationNeeded()) {
-        stat = Initialize(pointer, *derived, terminator, hasStat, errMsg);
+        stat = Initialize(
+            pointer, *derived, terminator, hasStat, errMsg, memcpyFct);
       }
     }
   }
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
index 98d7de81c7f08..960405ee0006f 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
@@ -252,6 +252,33 @@ constexpr TypeBuilderFunc getModel<void (*)(int)>() {
   };
 }
 template <>
+constexpr TypeBuilderFunc
+getModel<void *(*)(void *, const void *, unsigned long)>() {
+  return [](mlir::MLIRContext *context) -> mlir::Type {
+    auto voidPtrTy =
+        fir::LLVMPointerType::get(context, mlir::IntegerType::get(context, 8));
+    auto unsignedLongTy =
+        mlir::IntegerType::get(context, 8 * sizeof(unsigned long));
+    auto funcTy = mlir::FunctionType::get(
+        context, {voidPtrTy, voidPtrTy, unsignedLongTy}, {voidPtrTy});
+    return fir::LLVMPointerType::get(context, funcTy);
+  };
+}
+#ifdef _MSC_VER
+template <>
+constexpr TypeBuilderFunc
+getModel<void *(*)(void *, const void *, unsigned __int64)>() {
+  return [](mlir::MLIRContext *context) -> mlir::Type {
+    auto voidPtrTy =
+        fir::LLVMPointerType::get(context, mlir::IntegerType::get(context, 8));
+    auto uint64Ty = mlir::IntegerType::get(context, 64);
+    auto funcTy = mlir::FunctionType::get(
+        context, {voidPtrTy, voidPtrTy, uint64Ty}, {voidPtrTy});
+    return fir::LLVMPointerType::get(context, funcTy);
+  };
+}
+#endif
+template <>
 constexpr TypeBuilderFunc getModel<void **>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return fir::ReferenceType::get(
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 97f24bc34bfb8..d5a649594ae92 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -20,14 +20,14 @@ extern "C" {
 int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    int sourceLine = 0, bool deviceInit = false);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
 int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    int sourceLine = 0, bool deviceInit = false);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
diff --git a/flang/include/flang/Runtime/CUDA/memmove-function.h b/flang/include/flang/Runtime/CUDA/memmove-function.h
index 74d6a05eff4c9..765600db4b620 100644
--- a/flang/include/flang/Runtime/CUDA/memmove-function.h
+++ b/flang/include/flang/Runtime/CUDA/memmove-function.h
@@ -19,5 +19,11 @@ void *MemmoveDeviceToHost(void *dst, const void *src, std::size_t count);
 
 void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count);
 
+void *MemcpyHostToDevice(void *dst, const void *src, std::size_t count);
+
+void *MemcpyDeviceToHost(void *dst, const void *src, std::size_t count);
+
+void *MemcpyDeviceToDevice(void *dst, const void *src, std::size_t count);
+
 } // namespace Fortran::runtime::cuda
 #endif // FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index b845fd59114d4..4e49691d127e1 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -20,14 +20,14 @@ extern "C" {
 int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    int sourceLine = 0, bool deviceInit = false);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
 int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    int sourceLine = 0, bool deviceInit = false);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
diff --git a/flang/include/flang/Runtime/allocatable.h b/flang/include/flang/Runtime/allocatable.h
index 863c07494e7c3..ba065331e3922 100644
--- a/flang/include/flang/Runtime/allocatable.h
+++ b/flang/include/flang/Runtime/allocatable.h
@@ -13,6 +13,7 @@
 
 #include "flang/Runtime/descriptor-consts.h"
 #include "flang/Runtime/entry-names.h"
+#include "flang/Runtime/freestanding-tools.h"
 
 namespace Fortran::runtime {
 
@@ -94,10 +95,17 @@ int RTDECL(AllocatableCheckLengthParameter)(Descriptor &,
 // Successfully allocated memory is initialized if the allocatable has a
 // derived type, and is always initialized by AllocatableAllocateSource().
 // Performs all necessary coarray synchronization and validation actions.
+#ifdef RT_DEVICE_COMPILATION
 int RTDECL(AllocatableAllocate)(Descriptor &,
     std::int64_t *asyncObject = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    int sourceLine = 0, MemcpyFct memcpyFct = &MemcpyWrapper);
+#else
+int RTDECL(AllocatableAllocate)(Descriptor &,
+    std::int64_t *asyncObject = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0, MemcpyFct memcpyFct = &Fortran::runtime::memcpy);
+#endif
 int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
diff --git a/flang/include/flang/Runtime/freestanding-tools.h b/flang/include/flang/Runtime/freestanding-tools.h
index 7ef7cc74f213b..7ab06145c5d52 100644
--- a/flang/include/flang/Runtime/freestanding-tools.h
+++ b/flang/include/flang/Runtime/freestanding-tools.h
@@ -122,7 +122,7 @@ static inline RT_API_ATTRS void memcpy(
   __builtin_memcpy(dest, src, count);
 }
 #elif STD_MEMCPY_UNSUPPORTED
-static inline RT_API_ATTRS void memcpy(
+static inline RT_API_ATTRS void *memcpy(
     void *dest, const void *src, std::size_t count) {
   char *to{reinterpret_cast<char *>(dest)};
   const char *from{reinterpret_cast<const char *>(src)};
@@ -132,6 +132,7 @@ static inline RT_API_ATTRS void memcpy(
   while (count--) {
     *to++ = *from++;
   }
+  return dest;
 }
 #else
 using std::memcpy;
@@ -173,12 +174,17 @@ using std::memmove;
 #endif // !STD_MEMMOVE_UNSUPPORTED
 
 using MemmoveFct = void *(*)(void *, const void *, std::size_t);
+using MemcpyFct = void *(*)(void *, const void *, std::size_t);
 
 #ifdef RT_DEVICE_COMPILATION
 [[maybe_unused]] static RT_API_ATTRS void *MemmoveWrapper(
     void *dest, const void *src, std::size_t count) {
   return Fortran::runtime::memmove(dest, src, count);
 }
+[[maybe_unused]] static RT_API_ATTRS void *MemcpyWrapper(
+    void *dest, const void *src, std::size_t count) {
+  return Fortran::runtime::memcpy(dest, src, count);
+}
 #endif
 
 #if STD_STRLEN_USE_BUILT...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/172913