[flang] [llvm] Revert "Reland "[flang][cuda] Add support for derived-type initialization on device #172568"" (PR #174033)

Tue Dec 30 15:15:44 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

<details>
<summary>Changes</summary>

This fails on the staging builder https://lab.llvm.org/staging/#/builders/65
This reverts commit 1ac1a547ee3b74b4d02bc94faf02ca0381196d11.

---

Patch is 84.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/174033.diff


32 Files Affected:

- (modified) flang-rt/include/flang-rt/runtime/derived.h (+1-9) 
- (modified) flang-rt/include/flang-rt/runtime/work-queue.h (+7-17) 
- (modified) flang-rt/lib/cuda/allocatable.cpp (+6-6) 
- (modified) flang-rt/lib/cuda/memmove-function.cpp (-18) 
- (modified) flang-rt/lib/cuda/pointer.cpp (+6-6) 
- (modified) flang-rt/lib/runtime/allocatable.cpp (+3-3) 
- (modified) flang-rt/lib/runtime/derived.cpp (+6-24) 
- (modified) flang-rt/lib/runtime/pointer.cpp (+2-4) 
- (modified) flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h (-27) 
- (modified) flang/include/flang/Runtime/CUDA/allocatable.h (+2-2) 
- (modified) flang/include/flang/Runtime/CUDA/memmove-function.h (-6) 
- (modified) flang/include/flang/Runtime/CUDA/pointer.h (+2-2) 
- (modified) flang/include/flang/Runtime/allocatable.h (+1-9) 
- (modified) flang/include/flang/Runtime/freestanding-tools.h (+1-8) 
- (modified) flang/include/flang/Runtime/pointer.h (+1-8) 
- (modified) flang/lib/Lower/Allocatable.cpp (+9-1) 
- (modified) flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp (+3-4) 
- (modified) flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp (+1-6) 
- (modified) flang/test/Fir/CUDA/cuda-allocate.fir (+6-6) 
- (added) flang/test/Lower/CUDA/TODO/cuda-allocate-default-init.cuf (+15) 
- (modified) flang/test/Lower/Intrinsics/c_loc.f90 (+1-1) 
- (modified) flang/test/Lower/OpenACC/acc-declare.f90 (+2-2) 
- (modified) flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 (+1-1) 
- (modified) flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 (+1-1) 
- (modified) flang/test/Lower/allocatable-polymorphic.f90 (+17-17) 
- (modified) flang/test/Lower/allocatable-runtime.f90 (+2-2) 
- (modified) flang/test/Lower/allocate-mold.f90 (+2-2) 
- (modified) flang/test/Lower/assign-statement.f90 (+1-1) 
- (modified) flang/test/Lower/nullify-polymorphic.f90 (+1-1) 
- (modified) flang/test/Lower/polymorphic.f90 (+1-1) 
- (modified) flang/test/Lower/volatile-allocatable.f90 (+9-9) 
- (modified) flang/test/Transforms/lower-repack-arrays.fir (+4-4) 


``````````diff

diff --git a/flang-rt/include/flang-rt/runtime/derived.h b/flang-rt/include/flang-rt/runtime/derived.h
index 20d022df56170..ac6962c57168c 100644
--- a/flang-rt/include/flang-rt/runtime/derived.h
+++ b/flang-rt/include/flang-rt/runtime/derived.h
@@ -12,7 +12,6 @@
 #define FLANG_RT_RUNTIME_DERIVED_H_
 
 #include "flang/Common/api-attrs.h"
-#include "flang/Runtime/freestanding-tools.h"
 
 namespace Fortran::runtime::typeInfo {
 class DerivedType;
@@ -24,15 +23,8 @@ class Terminator;
 
 // Perform default component initialization, allocate automatic components.
 // Returns a STAT= code (0 when all's well).
-#ifdef RT_DEVICE_COMPILATION
 RT_API_ATTRS int Initialize(const Descriptor &, const typeInfo::DerivedType &,
-    Terminator &, bool hasStat = false, const Descriptor *errMsg = nullptr,
-    MemcpyFct memcpyFct = &MemcpyWrapper);
-#else
-RT_API_ATTRS int Initialize(const Descriptor &, const typeInfo::DerivedType &,
-    Terminator &, bool hasStat = false, const Descriptor *errMsg = nullptr,
-    MemcpyFct memcpyFct = &Fortran::runtime::memcpy);
-#endif
+    Terminator &, bool hasStat = false, const Descriptor *errMsg = nullptr);
 
 // Initializes an object clone from the original object.
 // Each allocatable member of the clone is allocated with the same bounds as
diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h
index 54a7457741356..7d7f8ad991a57 100644
--- a/flang-rt/include/flang-rt/runtime/work-queue.h
+++ b/flang-rt/include/flang-rt/runtime/work-queue.h
@@ -249,15 +249,12 @@ class ElementsOverComponents : public Elementwise, public Componentwise {
 class InitializeTicket : public ImmediateTicketRunner<InitializeTicket>,
                          private ElementsOverComponents {
 public:
-  RT_API_ATTRS InitializeTicket(const Descriptor &instance,
-      const typeInfo::DerivedType &derived, MemcpyFct memcpyFct)
+  RT_API_ATTRS InitializeTicket(
+      const Descriptor &instance, const typeInfo::DerivedType &derived)
       : ImmediateTicketRunner<InitializeTicket>{*this},
-        ElementsOverComponents{instance, derived}, memcpyFct_{memcpyFct} {}
+        ElementsOverComponents{instance, derived} {}
   RT_API_ATTRS int Begin(WorkQueue &);
   RT_API_ATTRS int Continue(WorkQueue &);
-
-private:
-  MemcpyFct memcpyFct_;
 };
 
 // Initializes one derived type instance from the value of another
@@ -451,19 +448,12 @@ class WorkQueue {
 
   // APIs for particular tasks.  These can return StatOk if the work is
   // completed immediately.
-#ifdef RT_DEVICE_COMPILATION
-  RT_API_ATTRS int BeginInitialize(const Descriptor &descriptor,
-      const typeInfo::DerivedType &derived,
-      MemcpyFct memcpyFct = &MemcpyWrapper) {
-#else
-  RT_API_ATTRS int BeginInitialize(const Descriptor &descriptor,
-      const typeInfo::DerivedType &derived,
-      MemcpyFct memcpyFct = &Fortran::runtime::memcpy) {
-#endif
+  RT_API_ATTRS int BeginInitialize(
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
     if (runTicketsImmediately_) {
-      return InitializeTicket{descriptor, derived, memcpyFct}.Run(*this);
+      return InitializeTicket{descriptor, derived}.Run(*this);
     } else {
-      StartTicket().u.emplace<InitializeTicket>(descriptor, derived, memcpyFct);
+      StartTicket().u.emplace<InitializeTicket>(descriptor, derived);
       return StatContinue;
     }
   }
diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index 0a7828f8016d5..662703dfb6321 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -25,9 +25,9 @@ RT_EXT_API_GROUP_BEGIN
 
 int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine, bool deviceInit) {
-  int stat{RTNAME(CUFAllocatableAllocate)(desc, stream, pinned, hasStat, errMsg,
-      sourceFile, sourceLine, deviceInit)};
+    const char *sourceFile, int sourceLine) {
+  int stat{RTNAME(CUFAllocatableAllocate)(
+      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -43,10 +43,10 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
 
 int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine, bool deviceInit) {
+    const char *sourceFile, int sourceLine) {
   // Perform the standard allocation.
-  int stat{RTNAME(AllocatableAllocate)(desc, stream, hasStat, errMsg,
-      sourceFile, sourceLine, deviceInit ? &MemcpyHostToDevice : nullptr)};
+  int stat{RTNAME(AllocatableAllocate)(
+      desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
   if (pinned) {
     // Set pinned according to stat. More infrastructre is needed to set it
     // closer to the actual allocation call.
diff --git a/flang-rt/lib/cuda/memmove-function.cpp b/flang-rt/lib/cuda/memmove-function.cpp
index 8ebc1250a6687..a7eb0cf1a3e7a 100644
--- a/flang-rt/lib/cuda/memmove-function.cpp
+++ b/flang-rt/lib/cuda/memmove-function.cpp
@@ -32,22 +32,4 @@ void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count) {
   return dst;
 }
 
-void *MemcpyHostToDevice(void *dst, const void *src, std::size_t count) {
-  // TODO: Use cudaMemcpyAsync when we have support for stream.
-  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
-  return dst;
-}
-
-void *MemcpyDeviceToHost(void *dst, const void *src, std::size_t count) {
-  // TODO: Use cudaMemcpyAsync when we have support for stream.
-  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost));
-  return dst;
-}
-
-void *MemcpyDeviceToDevice(void *dst, const void *src, std::size_t count) {
-  // TODO: Use cudaMemcpyAsync when we have support for stream.
-  CUDA_REPORT_IF_ERROR(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToDevice));
-  return dst;
-}
-
 } // namespace Fortran::runtime::cuda
diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp
index bc990c5d27e21..f07b1a9b60924 100644
--- a/flang-rt/lib/cuda/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@@ -24,10 +24,10 @@ RT_EXT_API_GROUP_BEGIN
 
 int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
     bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine, bool deviceInit) {
+    int sourceLine) {
   // Perform the standard allocation.
-  int stat{RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile,
-      sourceLine, deviceInit ? &MemcpyHostToDevice : nullptr)};
+  int stat{
+      RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
   if (pinned) {
     // Set pinned according to stat. More infrastructre is needed to set it
     // closer to the actual allocation call.
@@ -38,9 +38,9 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
 
 int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine, bool deviceInit) {
-  int stat{RTNAME(CUFPointerAllocate)(desc, stream, pinned, hasStat, errMsg,
-      sourceFile, sourceLine, deviceInit)};
+    const char *sourceFile, int sourceLine) {
+  int stat{RTNAME(CUFPointerAllocate)(
+      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
diff --git a/flang-rt/lib/runtime/allocatable.cpp b/flang-rt/lib/runtime/allocatable.cpp
index 5b3db1e47238b..f724f0a20884b 100644
--- a/flang-rt/lib/runtime/allocatable.cpp
+++ b/flang-rt/lib/runtime/allocatable.cpp
@@ -135,7 +135,7 @@ void RTDEF(AllocatableApplyMold)(
 
 int RTDEF(AllocatableAllocate)(Descriptor &descriptor,
     std::int64_t *asyncObject, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine, MemcpyFct memcpyFct) {
+    const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   if (!descriptor.IsAllocatable()) {
     return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
@@ -148,8 +148,8 @@ int RTDEF(AllocatableAllocate)(Descriptor &descriptor,
       if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
         if (const auto *derived{addendum->derivedType()}) {
           if (!derived->noInitializationNeeded()) {
-            stat = Initialize(
-                descriptor, *derived, terminator, hasStat, errMsg, memcpyFct);
+            stat =
+                Initialize(descriptor, *derived, terminator, hasStat, errMsg);
           }
         }
       }
diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index 7fc426b9efc9a..7e50674631624 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -13,7 +13,6 @@
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
 #include "flang-rt/runtime/work-queue.h"
-#include "flang/Runtime/CUDA/memmove-function.h"
 
 namespace Fortran::runtime {
 
@@ -33,9 +32,9 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank],
 
 RT_API_ATTRS int Initialize(const Descriptor &instance,
     const typeInfo::DerivedType &derived, Terminator &terminator, bool,
-    const Descriptor *, MemcpyFct memcpyFct) {
+    const Descriptor *) {
   WorkQueue workQueue{terminator};
-  int status{workQueue.BeginInitialize(instance, derived, memcpyFct)};
+  int status{workQueue.BeginInitialize(instance, derived)};
   return status == StatContinue ? workQueue.Run() : status;
 }
 
@@ -73,11 +72,7 @@ RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
       // Explicit initialization of data pointers and
       // non-allocatable non-automatic components
       std::size_t bytes{component_->SizeInBytes(instance_)};
-      if (memcpyFct_) {
-        memcpyFct_(rawComponent, init, bytes);
-      } else {
-        Fortran::runtime::memcpy(rawComponent, init, bytes);
-      }
+      runtime::memcpy(rawComponent, init, bytes);
     } else if (component_->genre() == typeInfo::Component::Genre::Pointer ||
         component_->genre() == typeInfo::Component::Genre::PointerDevice) {
       // Data pointers without explicit initialization are established
@@ -115,33 +110,20 @@ RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
             chunk = done;
           }
           char *uninitialized{rawInstance + done * *stride};
-          if (memcpyFct_) {
-            memcpyFct_(uninitialized, rawInstance, chunk * *stride);
-          } else {
-            Fortran::runtime::memcpy(
-                uninitialized, rawInstance, chunk * *stride);
-          }
+          runtime::memcpy(uninitialized, rawInstance, chunk * *stride);
           done += chunk;
         }
       } else {
         for (std::size_t done{1}; done < elements_; ++done) {
           char *uninitialized{rawInstance + done * *stride};
-          if (memcpyFct_) {
-            memcpyFct_(uninitialized, rawInstance, elementBytes);
-          } else {
-            Fortran::runtime::memcpy(uninitialized, rawInstance, elementBytes);
-          }
+          runtime::memcpy(uninitialized, rawInstance, elementBytes);
         }
       }
     } else { // one at a time with subscription
       for (Elementwise::Advance(); !Elementwise::IsComplete();
           Elementwise::Advance()) {
         char *element{instance_.Element<char>(subscripts_)};
-        if (memcpyFct_) {
-          memcpyFct_(element, rawInstance, elementBytes);
-        } else {
-          Fortran::runtime::memcpy(element, rawInstance, elementBytes);
-        }
+        runtime::memcpy(element, rawInstance, elementBytes);
       }
     }
   }
diff --git a/flang-rt/lib/runtime/pointer.cpp b/flang-rt/lib/runtime/pointer.cpp
index 0832b5656f1ab..f8ada65541a1a 100644
--- a/flang-rt/lib/runtime/pointer.cpp
+++ b/flang-rt/lib/runtime/pointer.cpp
@@ -157,8 +157,7 @@ RT_API_ATTRS void *AllocateValidatedPointerPayload(
 }
 
 int RTDEF(PointerAllocate)(Descriptor &pointer, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine,
-    MemcpyFct memcpyFct) {
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
   if (!pointer.IsPointer()) {
     return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
@@ -180,8 +179,7 @@ int RTDEF(PointerAllocate)(Descriptor &pointer, bool hasStat,
   if (const DescriptorAddendum * addendum{pointer.Addendum()}) {
     if (const auto *derived{addendum->derivedType()}) {
       if (!derived->noInitializationNeeded()) {
-        stat = Initialize(
-            pointer, *derived, terminator, hasStat, errMsg, memcpyFct);
+        stat = Initialize(pointer, *derived, terminator, hasStat, errMsg);
       }
     }
   }
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
index 960405ee0006f..98d7de81c7f08 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
@@ -252,33 +252,6 @@ constexpr TypeBuilderFunc getModel<void (*)(int)>() {
   };
 }
 template <>
-constexpr TypeBuilderFunc
-getModel<void *(*)(void *, const void *, unsigned long)>() {
-  return [](mlir::MLIRContext *context) -> mlir::Type {
-    auto voidPtrTy =
-        fir::LLVMPointerType::get(context, mlir::IntegerType::get(context, 8));
-    auto unsignedLongTy =
-        mlir::IntegerType::get(context, 8 * sizeof(unsigned long));
-    auto funcTy = mlir::FunctionType::get(
-        context, {voidPtrTy, voidPtrTy, unsignedLongTy}, {voidPtrTy});
-    return fir::LLVMPointerType::get(context, funcTy);
-  };
-}
-#ifdef _MSC_VER
-template <>
-constexpr TypeBuilderFunc
-getModel<void *(*)(void *, const void *, unsigned __int64)>() {
-  return [](mlir::MLIRContext *context) -> mlir::Type {
-    auto voidPtrTy =
-        fir::LLVMPointerType::get(context, mlir::IntegerType::get(context, 8));
-    auto uint64Ty = mlir::IntegerType::get(context, 64);
-    auto funcTy = mlir::FunctionType::get(
-        context, {voidPtrTy, voidPtrTy, uint64Ty}, {voidPtrTy});
-    return fir::LLVMPointerType::get(context, funcTy);
-  };
-}
-#endif
-template <>
 constexpr TypeBuilderFunc getModel<void **>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return fir::ReferenceType::get(
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index d5a649594ae92..97f24bc34bfb8 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -20,14 +20,14 @@ extern "C" {
 int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0, bool deviceInit = false);
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
 int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0, bool deviceInit = false);
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
diff --git a/flang/include/flang/Runtime/CUDA/memmove-function.h b/flang/include/flang/Runtime/CUDA/memmove-function.h
index 765600db4b620..74d6a05eff4c9 100644
--- a/flang/include/flang/Runtime/CUDA/memmove-function.h
+++ b/flang/include/flang/Runtime/CUDA/memmove-function.h
@@ -19,11 +19,5 @@ void *MemmoveDeviceToHost(void *dst, const void *src, std::size_t count);
 
 void *MemmoveDeviceToDevice(void *dst, const void *src, std::size_t count);
 
-void *MemcpyHostToDevice(void *dst, const void *src, std::size_t count);
-
-void *MemcpyDeviceToHost(void *dst, const void *src, std::size_t count);
-
-void *MemcpyDeviceToDevice(void *dst, const void *src, std::size_t count);
-
 } // namespace Fortran::runtime::cuda
 #endif // FORTRAN_RUNTIME_CUDA_MEMMOVE_FUNCTION_H_
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index 4e49691d127e1..b845fd59114d4 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -20,14 +20,14 @@ extern "C" {
 int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0, bool deviceInit = false);
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
 int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0, bool deviceInit = false);
+    int sourceLine = 0);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
diff --git a/flang/include/flang/Runtime/allocatable.h b/flang/include/flang/Runtime/allocatable.h
index ba065331e3922..863c07494e7c3 100644
--- a/flang/include/flang/Runtime/allocatable.h
+++ b/flang/include/flang/Runtime/allocatable.h
@@ -13,7 +13,6 @@
 
 #include "flang/Runtime/descriptor-consts.h"
 #include "flang/Runtime/entry-names.h"
-#include "flang/Runtime/freestanding-tools.h"
 
 namespace Fortran::runtime {
 
@@ -95,17 +94,10 @@ int RTDECL(AllocatableCheckLengthParameter)(Descriptor &,
 // Successfully allocated memory is initialized if the allocatable has a
 // derived type, and is always initialized by AllocatableAllocateSource().
 // Performs all necessary coarray synchronization and validation actions.
-#ifdef RT_DEVICE_COMPILATION
 int RTDECL(AllocatableAllocate)(Descriptor &,
     std::int64_t *asyncObject = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0, MemcpyFct memcpyFct = &MemcpyWrapper);
-#else
-int RTDECL(AllocatableAllocate)(Descriptor &,
-    std::int64_t *asyncObject = nullptr, bool hasStat = false,
-    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0, MemcpyFct memcpyFct = &Fortran::runtime::memcpy);
-#endif
+    int sourceLine = 0);
 int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
diff --git a/flang/include/flang/Runtime/freestanding-tools.h b/flang/include/flang/Runtime/freestanding-tools.h
index 42129328d0b02..7ef7cc74f213b 100644
--- a/flang/include/flang/Runtime/freestanding-tools.h
+++ b/flang/include/flang/Runtime/freestanding-tools.h
@@ -122,7 +122,7 @@ static inline RT_API_ATTRS void memcpy(
   __builtin_memcpy(dest, src, count);
 }
 #elif STD_MEMCPY_UNSUPPORTED
-static inline RT_API_ATTRS void *memcpy(
+static inline RT_API_ATTRS void memcpy(
     void *dest, const void *src, std::size_t count) {
   char *to{reinterpret_cast<char *>(dest)};
   const char *from{reinterpret_cast<const char *>(src)};
@@ -132,7 +132,6 @@ static inline RT_API_ATTRS void *memcpy(
   while (count--) {
     *to++ = *from++;
   }
-  return dest;
 }
 #else
 using std::memcpy;
@@ -174,18 +173,12 @@ using std::memmove;
 #endif // !STD_MEMMOVE_UNSUPPORTED
 
 using MemmoveFct = void *(*)(void *, const void *, std::size_t);
-using MemcpyFct = void *(*)(void *, const void *, std::size_t);
 
 #ifdef RT_DEVICE_COMPILATION
 [[maybe_unused]] static RT_API_ATTRS void *MemmoveWrapper(
     void *dest, const void *src, std::size_t count) {
   return Fortran::runtime::memmove(dest, src, count);
 }
-[[maybe_unused]] static RT_API_ATTRS void *MemcpyWrapper(
-    void *dest, const void *src, std::size_t count) {
-  Fortran::runtime::memcpy(dest, src, count);
-  return dest;
-}
 #endif
 
 #if STD_STRLEN...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/174033