[flang-commits] [flang] [llvm] [flang][cuda] Add support for derived-type initialization on device (PR #172568)

Valentin Clement バレンタイン クレメン via flang-commits flang-commits at lists.llvm.org
Tue Dec 16 17:48:30 PST 2025


https://github.com/clementval updated https://github.com/llvm/llvm-project/pull/172568

>From 3a9e30a84c39b94b5c3c0d74b9545660cd60377d Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 16 Dec 2025 13:42:19 -0800
Subject: [PATCH 1/5] [flang][cuda] Add support for derived-type initialization
 on device

---
 flang-rt/include/flang-rt/runtime/derived.h   |  4 ++-
 .../include/flang-rt/runtime/work-queue.h     | 18 ++++++----
 flang-rt/lib/cuda/allocatable.cpp             | 12 +++----
 flang-rt/lib/cuda/pointer.cpp                 | 12 +++----
 flang-rt/lib/runtime/allocatable.cpp          |  6 ++--
 flang-rt/lib/runtime/derived.cpp              | 29 ++++++++++++----
 flang-rt/lib/runtime/pointer.cpp              |  6 ++--
 .../Optimizer/Builder/Runtime/RTBuilder.h     | 13 +++++++
 .../include/flang/Runtime/CUDA/allocatable.h  |  4 +--
 flang/include/flang/Runtime/CUDA/pointer.h    |  4 +--
 flang/include/flang/Runtime/allocatable.h     |  3 +-
 flang/include/flang/Runtime/pointer.h         |  3 +-
 flang/lib/Lower/Allocatable.cpp               | 16 +++++----
 .../Optimizer/Builder/Runtime/Allocatable.cpp |  7 ++--
 .../CUDA/CUFAllocationConversion.cpp          |  7 +++-
 flang/test/Fir/CUDA/cuda-allocate.fir         | 12 +++----
 .../CUDA/TODO/cuda-allocate-default-init.cuf  | 15 --------
 flang/test/Lower/Intrinsics/c_loc.f90         |  2 +-
 flang/test/Lower/OpenACC/acc-declare.f90      |  4 +--
 .../parallel-reduction-pointer-array.f90      |  2 +-
 .../Lower/OpenMP/wsloop-reduction-pointer.f90 |  2 +-
 flang/test/Lower/allocatable-polymorphic.f90  | 34 +++++++++----------
 flang/test/Lower/allocatable-runtime.f90      |  4 +--
 flang/test/Lower/allocate-mold.f90            |  4 +--
 flang/test/Lower/assign-statement.f90         |  2 +-
 flang/test/Lower/nullify-polymorphic.f90      |  2 +-
 flang/test/Lower/polymorphic.f90              |  2 +-
 flang/test/Lower/volatile-allocatable.f90     | 18 +++++-----
 flang/test/Transforms/lower-repack-arrays.fir |  8 ++---
 29 files changed, 144 insertions(+), 111 deletions(-)
 delete mode 100644 flang/test/Lower/CUDA/TODO/cuda-allocate-default-init.cuf

diff --git a/flang-rt/include/flang-rt/runtime/derived.h b/flang-rt/include/flang-rt/runtime/derived.h
index ac6962c57168c..64e23b342ea2d 100644
--- a/flang-rt/include/flang-rt/runtime/derived.h
+++ b/flang-rt/include/flang-rt/runtime/derived.h
@@ -12,6 +12,7 @@
 #define FLANG_RT_RUNTIME_DERIVED_H_
 
 #include "flang/Common/api-attrs.h"
+#include "flang/Runtime/freestanding-tools.h"
 
 namespace Fortran::runtime::typeInfo {
 class DerivedType;
@@ -24,7 +25,8 @@ class Terminator;
 // Perform default component initialization, allocate automatic components.
 // Returns a STAT= code (0 when all's well).
 RT_API_ATTRS int Initialize(const Descriptor &, const typeInfo::DerivedType &,
-    Terminator &, bool hasStat = false, const Descriptor *errMsg = nullptr);
+    Terminator &, bool hasStat = false, const Descriptor *errMsg = nullptr,
+    MemmoveFct memmoveFct = nullptr);
 
 // Initializes an object clone from the original object.
 // Each allocatable member of the clone is allocated with the same bounds as
diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h
index 7d7f8ad991a57..fe1744c7b0474 100644
--- a/flang-rt/include/flang-rt/runtime/work-queue.h
+++ b/flang-rt/include/flang-rt/runtime/work-queue.h
@@ -249,12 +249,15 @@ class ElementsOverComponents : public Elementwise, public Componentwise {
 class InitializeTicket : public ImmediateTicketRunner<InitializeTicket>,
                          private ElementsOverComponents {
 public:
-  RT_API_ATTRS InitializeTicket(
-      const Descriptor &instance, const typeInfo::DerivedType &derived)
+  RT_API_ATTRS InitializeTicket(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, MemmoveFct memmoveFct)
       : ImmediateTicketRunner<InitializeTicket>{*this},
-        ElementsOverComponents{instance, derived} {}
+        ElementsOverComponents{instance, derived}, memmoveFct_{memmoveFct} {}
   RT_API_ATTRS int Begin(WorkQueue &);
   RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  MemmoveFct memmoveFct_;
 };
 
 // Initializes one derived type instance from the value of another
@@ -448,12 +451,13 @@ class WorkQueue {
 
   // APIs for particular tasks.  These can return StatOk if the work is
   // completed immediately.
-  RT_API_ATTRS int BeginInitialize(
-      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
+  RT_API_ATTRS int BeginInitialize(const Descriptor &descriptor,
+      const typeInfo::DerivedType &derived, MemmoveFct memmoveFct = nullptr) {
     if (runTicketsImmediately_) {
-      return InitializeTicket{descriptor, derived}.Run(*this);
+      return InitializeTicket{descriptor, derived, memmoveFct}.Run(*this);
     } else {
-      StartTicket().u.emplace<InitializeTicket>(descriptor, derived);
+      StartTicket().u.emplace<InitializeTicket>(
+          descriptor, derived, memmoveFct);
       return StatContinue;
     }
   }
diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index 662703dfb6321..90eae99f9cbd4 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -25,9 +25,9 @@ RT_EXT_API_GROUP_BEGIN
 
 int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFAllocatableAllocate)(
-      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+    const char *sourceFile, int sourceLine, bool deviceInit) {
+  int stat{RTNAME(CUFAllocatableAllocate)(desc, stream, pinned, hasStat, errMsg,
+      sourceFile, sourceLine, deviceInit)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
@@ -43,10 +43,10 @@ int RTDEF(CUFAllocatableAllocateSync)(Descriptor &desc, int64_t *stream,
 
 int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
+    const char *sourceFile, int sourceLine, bool deviceInit) {
   // Perform the standard allocation.
-  int stat{RTNAME(AllocatableAllocate)(
-      desc, stream, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(AllocatableAllocate)(desc, stream, hasStat, errMsg,
+      sourceFile, sourceLine, deviceInit ? &MemmoveHostToDevice : nullptr)};
   if (pinned) {
     // Set pinned according to stat. More infrastructre is needed to set it
     // closer to the actual allocation call.
diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp
index f07b1a9b60924..eb96db26a06f5 100644
--- a/flang-rt/lib/cuda/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@@ -24,10 +24,10 @@ RT_EXT_API_GROUP_BEGIN
 
 int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
     bool hasStat, const Descriptor *errMsg, const char *sourceFile,
-    int sourceLine) {
+    int sourceLine, bool deviceInit) {
   // Perform the standard allocation.
-  int stat{
-      RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile, sourceLine)};
+  int stat{RTNAME(PointerAllocate)(desc, hasStat, errMsg, sourceFile,
+      sourceLine, deviceInit ? &MemmoveHostToDevice : nullptr)};
   if (pinned) {
     // Set pinned according to stat. More infrastructre is needed to set it
     // closer to the actual allocation call.
@@ -38,9 +38,9 @@ int RTDEF(CUFPointerAllocate)(Descriptor &desc, int64_t *stream, bool *pinned,
 
 int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
     bool *pinned, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
-  int stat{RTNAME(CUFPointerAllocate)(
-      desc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+    const char *sourceFile, int sourceLine, bool sourceIsDevice) {
+  int stat{RTNAME(CUFPointerAllocate)(desc, stream, pinned, hasStat, errMsg,
+      sourceFile, sourceLine, deviceInit)};
 #ifndef RT_DEVICE_COMPILATION
   // Descriptor synchronization is only done when the allocation is done
   // from the host.
diff --git a/flang-rt/lib/runtime/allocatable.cpp b/flang-rt/lib/runtime/allocatable.cpp
index f724f0a20884b..09f381857a702 100644
--- a/flang-rt/lib/runtime/allocatable.cpp
+++ b/flang-rt/lib/runtime/allocatable.cpp
@@ -135,7 +135,7 @@ void RTDEF(AllocatableApplyMold)(
 
 int RTDEF(AllocatableAllocate)(Descriptor &descriptor,
     std::int64_t *asyncObject, bool hasStat, const Descriptor *errMsg,
-    const char *sourceFile, int sourceLine) {
+    const char *sourceFile, int sourceLine, MemmoveFct memmoveFct) {
   Terminator terminator{sourceFile, sourceLine};
   if (!descriptor.IsAllocatable()) {
     return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
@@ -148,8 +148,8 @@ int RTDEF(AllocatableAllocate)(Descriptor &descriptor,
       if (const DescriptorAddendum * addendum{descriptor.Addendum()}) {
         if (const auto *derived{addendum->derivedType()}) {
           if (!derived->noInitializationNeeded()) {
-            stat =
-                Initialize(descriptor, *derived, terminator, hasStat, errMsg);
+            stat = Initialize(
+                descriptor, *derived, terminator, hasStat, errMsg, memmoveFct);
           }
         }
       }
diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index 7e50674631624..31529716633fb 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -13,6 +13,7 @@
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
 #include "flang-rt/runtime/work-queue.h"
+#include "flang/Runtime/CUDA/memmove-function.h"
 
 namespace Fortran::runtime {
 
@@ -32,9 +33,9 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank],
 
 RT_API_ATTRS int Initialize(const Descriptor &instance,
     const typeInfo::DerivedType &derived, Terminator &terminator, bool,
-    const Descriptor *) {
+    const Descriptor *, MemmoveFct memmoveFct) {
   WorkQueue workQueue{terminator};
-  int status{workQueue.BeginInitialize(instance, derived)};
+  int status{workQueue.BeginInitialize(instance, derived, memmoveFct)};
   return status == StatContinue ? workQueue.Run() : status;
 }
 
@@ -72,7 +73,11 @@ RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
       // Explicit initialization of data pointers and
       // non-allocatable non-automatic components
       std::size_t bytes{component_->SizeInBytes(instance_)};
-      runtime::memcpy(rawComponent, init, bytes);
+      if (!memmoveFct_) {
+        runtime::memcpy(rawComponent, init, bytes);
+      } else {
+        memmoveFct_(rawComponent, init, bytes);
+      }
     } else if (component_->genre() == typeInfo::Component::Genre::Pointer ||
         component_->genre() == typeInfo::Component::Genre::PointerDevice) {
       // Data pointers without explicit initialization are established
@@ -110,20 +115,32 @@ RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
             chunk = done;
           }
           char *uninitialized{rawInstance + done * *stride};
-          runtime::memcpy(uninitialized, rawInstance, chunk * *stride);
+          if (!memmoveFct_) {
+            runtime::memcpy(uninitialized, rawInstance, chunk * *stride);
+          } else {
+            memmoveFct_(uninitialized, rawInstance, chunk * *stride);
+          }
           done += chunk;
         }
       } else {
         for (std::size_t done{1}; done < elements_; ++done) {
           char *uninitialized{rawInstance + done * *stride};
-          runtime::memcpy(uninitialized, rawInstance, elementBytes);
+          if (!memmoveFct_) {
+            runtime::memcpy(uninitialized, rawInstance, elementBytes);
+          } else {
+            memmoveFct_(uninitialized, rawInstance, elementBytes);
+          }
         }
       }
     } else { // one at a time with subscription
       for (Elementwise::Advance(); !Elementwise::IsComplete();
           Elementwise::Advance()) {
         char *element{instance_.Element<char>(subscripts_)};
-        runtime::memcpy(element, rawInstance, elementBytes);
+        if (!memmoveFct_) {
+          runtime::memcpy(element, rawInstance, elementBytes);
+        } else {
+          memmoveFct_(element, rawInstance, elementBytes);
+        }
       }
     }
   }
diff --git a/flang-rt/lib/runtime/pointer.cpp b/flang-rt/lib/runtime/pointer.cpp
index f8ada65541a1a..520d2b92f02e5 100644
--- a/flang-rt/lib/runtime/pointer.cpp
+++ b/flang-rt/lib/runtime/pointer.cpp
@@ -157,7 +157,8 @@ RT_API_ATTRS void *AllocateValidatedPointerPayload(
 }
 
 int RTDEF(PointerAllocate)(Descriptor &pointer, bool hasStat,
-    const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+    const Descriptor *errMsg, const char *sourceFile, int sourceLine,
+    MemmoveFct memmoveFct) {
   Terminator terminator{sourceFile, sourceLine};
   if (!pointer.IsPointer()) {
     return ReturnError(terminator, StatInvalidDescriptor, errMsg, hasStat);
@@ -179,7 +180,8 @@ int RTDEF(PointerAllocate)(Descriptor &pointer, bool hasStat,
   if (const DescriptorAddendum * addendum{pointer.Addendum()}) {
     if (const auto *derived{addendum->derivedType()}) {
       if (!derived->noInitializationNeeded()) {
-        stat = Initialize(pointer, *derived, terminator, hasStat, errMsg);
+        stat = Initialize(
+            pointer, *derived, terminator, hasStat, errMsg, memmoveFct);
       }
     }
   }
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
index 98d7de81c7f08..3a3f629fbf5c3 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
@@ -252,6 +252,19 @@ constexpr TypeBuilderFunc getModel<void (*)(int)>() {
   };
 }
 template <>
+constexpr TypeBuilderFunc
+getModel<void *(*)(void *, const void *, unsigned long)>() {
+  return [](mlir::MLIRContext *context) -> mlir::Type {
+    auto voidPtrTy =
+        fir::LLVMPointerType::get(context, mlir::IntegerType::get(context, 8));
+    auto unsignedLongTy =
+        mlir::IntegerType::get(context, 8 * sizeof(unsigned long));
+    auto funcTy = mlir::FunctionType::get(
+        context, {voidPtrTy, voidPtrTy, unsignedLongTy}, {voidPtrTy});
+    return fir::LLVMPointerType::get(context, funcTy);
+  };
+}
+template <>
 constexpr TypeBuilderFunc getModel<void **>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return fir::ReferenceType::get(
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 97f24bc34bfb8..d5a649594ae92 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -20,14 +20,14 @@ extern "C" {
 int RTDECL(CUFAllocatableAllocate)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    int sourceLine = 0, bool deviceInit = false);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
 int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    int sourceLine = 0, bool deviceInit = false);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index b845fd59114d4..4e49691d127e1 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -20,14 +20,14 @@ extern "C" {
 int RTDECL(CUFPointerAllocate)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    int sourceLine = 0, bool deviceInit = false);
 
 /// Perform allocation of the descriptor with synchronization of it when
 /// necessary.
 int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
     bool *pinned = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    int sourceLine = 0, bool deviceInit = false);
 
 /// Perform allocation of the descriptor without synchronization. Assign data
 /// from source.
diff --git a/flang/include/flang/Runtime/allocatable.h b/flang/include/flang/Runtime/allocatable.h
index 863c07494e7c3..df0a79160cbfc 100644
--- a/flang/include/flang/Runtime/allocatable.h
+++ b/flang/include/flang/Runtime/allocatable.h
@@ -13,6 +13,7 @@
 
 #include "flang/Runtime/descriptor-consts.h"
 #include "flang/Runtime/entry-names.h"
+#include "flang/Runtime/freestanding-tools.h"
 
 namespace Fortran::runtime {
 
@@ -97,7 +98,7 @@ int RTDECL(AllocatableCheckLengthParameter)(Descriptor &,
 int RTDECL(AllocatableAllocate)(Descriptor &,
     std::int64_t *asyncObject = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    int sourceLine = 0, MemmoveFct memmoveFct = nullptr);
 int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
diff --git a/flang/include/flang/Runtime/pointer.h b/flang/include/flang/Runtime/pointer.h
index 6787ef3ece232..2812a8619796f 100644
--- a/flang/include/flang/Runtime/pointer.h
+++ b/flang/include/flang/Runtime/pointer.h
@@ -14,6 +14,7 @@
 
 #include "flang/Runtime/descriptor-consts.h"
 #include "flang/Runtime/entry-names.h"
+#include "flang/Runtime/freestanding-tools.h"
 
 namespace Fortran::runtime {
 extern "C" {
@@ -90,7 +91,7 @@ int RTDECL(PointerCheckLengthParameter)(Descriptor &,
 // Performs all necessary coarray synchronization and validation actions.
 int RTDECL(PointerAllocate)(Descriptor &, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0);
+    int sourceLine = 0, MemmoveFct memmoveFct = nullptr);
 int RTDECL(PointerAllocateSource)(Descriptor &, const Descriptor &source,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 030439550cd15..0996e30dfb553 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -192,6 +192,7 @@ static mlir::Value genRuntimeAllocate(fir::FirOpBuilder &builder,
   args.push_back(errorManager.errMsgAddr);
   args.push_back(errorManager.sourceFile);
   args.push_back(errorManager.sourceLine);
+  args.push_back(builder.createBool(loc, false));
   const auto convertedArgs = fir::runtime::createArguments(
       builder, loc, callee.getFunctionType(), args);
   return fir::CallOp::create(builder, loc, callee, convertedArgs).getResult(0);
@@ -773,13 +774,14 @@ class AllocateStmtHelper {
                               ErrorManager &errorManager,
                               const Fortran::semantics::Symbol &sym) {
 
-    if (const Fortran::semantics::DeclTypeSpec *declTypeSpec = sym.GetType())
-      if (const Fortran::semantics::DerivedTypeSpec *derivedTypeSpec =
-              declTypeSpec->AsDerived())
-        if (derivedTypeSpec->HasDefaultInitialization(
-                /*ignoreAllocatable=*/true, /*ignorePointer=*/true))
-          TODO(loc,
-               "CUDA Fortran: allocate on device with default initialization");
+    // if (const Fortran::semantics::DeclTypeSpec *declTypeSpec = sym.GetType())
+    //   if (const Fortran::semantics::DerivedTypeSpec *derivedTypeSpec =
+    //           declTypeSpec->AsDerived())
+    //     if (derivedTypeSpec->HasDefaultInitialization(
+    //             /*ignoreAllocatable=*/true, /*ignorePointer=*/true))
+    //       TODO(loc,
+    //            "CUDA Fortran: allocate on device with default
+    //            initialization");
 
     Fortran::lower::StatementContext stmtCtx;
     cuf::DataAttributeAttr cudaAttr =
diff --git a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
index cc9f8280a172c..89f5f45356ae5 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Allocatable.cpp
@@ -86,8 +86,9 @@ void fir::runtime::genAllocatableAllocate(fir::FirOpBuilder &builder,
     mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType());
     errMsg = fir::AbsentOp::create(builder, loc, boxNoneTy).getResult();
   }
-  llvm::SmallVector<mlir::Value> args{
-      fir::runtime::createArguments(builder, loc, fTy, desc, asyncObject,
-                                    hasStat, errMsg, sourceFile, sourceLine)};
+  mlir::Value deviceInit = builder.createBool(loc, false);
+  llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+      builder, loc, fTy, desc, asyncObject, hasStat, errMsg, sourceFile,
+      sourceLine, deviceInit)};
   fir::CallOp::create(builder, loc, func, args);
 }
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
index 4444fc61239ea..4e2bcb6e95aee 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
@@ -128,9 +128,14 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
       mlir::Value stream =
           op.getStream() ? op.getStream()
                          : builder.createNullConstant(loc, fTy.getInput(1));
+      mlir::Value deviceInit =
+          (op.getDataAttrAttr() &&
+           op.getDataAttrAttr().getValue() == cuf::DataAttribute::Device)
+              ? builder.createBool(loc, true)
+              : builder.createBool(loc, false);
       args = fir::runtime::createArguments(builder, loc, fTy, op.getBox(),
                                            stream, pinned, hasStat, errmsg,
-                                           sourceFile, sourceLine);
+                                           sourceFile, sourceLine, deviceInit);
     }
   } else {
     args =
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 5184561a03e67..98700aee6710a 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -19,7 +19,7 @@ func.func @_QPsub1() {
 // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 // CHECK: %[[DECL_DESC:.*]]:2 = hlfir.declare %[[DESC]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: %{{.*}} = fir.call @_FortranAAllocatableDeallocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -47,7 +47,7 @@ func.func @_QPsub3() {
 // CHECK: %[[A:.*]]:2 = hlfir.declare %[[A_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 // CHECK: %[[A_BOX:.*]] = fir.convert %[[A]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK: fir.call @_FortranACUFAllocatableDeallocate(%[[A_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -87,7 +87,7 @@ func.func @_QPsub5() {
 }
 
 // CHECK-LABEL: func.func @_QPsub5()
-// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate({{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 // CHECK: fir.call @_FortranAAllocatableDeallocate({{.*}}) : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 
 
@@ -118,7 +118,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
 // CHECK: %[[B:.*]]:2 = hlfir.declare %[[B_ADDR]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMdataEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 // CHECK: _FortranAAllocatableSetBounds
 // CHECK: %[[B_BOX:.*]] = fir.convert %[[B]]#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocateSync(%[[B_BOX]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 
 func.func @_QPallocate_source() {
@@ -180,7 +180,7 @@ func.func @_QQallocate_stream() {
 // CHECK-LABEL: func.func @_QQallocate_stream() 
 // CHECK: %[[STREAM_ALLOCA:.*]] = fir.alloca i64 {bindc_name = "stream1", uniq_name = "_QFEstream1"}
 // CHECK: %[[STREAM:.*]] = fir.declare %[[STREAM_ALLOCA]] {uniq_name = "_QFEstream1"} : (!fir.ref<i64>) -> !fir.ref<i64>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 
 func.func @_QPp_alloc() {
@@ -269,6 +269,6 @@ func.func @_QQpinned() attributes {fir.bindc_name = "testasync"} {
 // CHECK: %[[PINNED:.*]] = fir.alloca !fir.logical<4> {bindc_name = "pinnedflag", uniq_name = "_QFEpinnedflag"}
 // CHECK: %[[DECL_PINNED:.*]] = fir.declare %[[PINNED]] {uniq_name = "_QFEpinnedflag"} : (!fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>>
 // CHECK: %[[CONV_PINNED:.*]] = fir.convert %[[DECL_PINNED]] : (!fir.ref<!fir.logical<4>>) -> !fir.ref<i1>
-// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %{{.*}}, %[[CONV_PINNED]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK: fir.call @_FortranACUFAllocatableAllocate(%{{.*}}, %{{.*}}, %[[CONV_PINNED]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 } // end of module
diff --git a/flang/test/Lower/CUDA/TODO/cuda-allocate-default-init.cuf b/flang/test/Lower/CUDA/TODO/cuda-allocate-default-init.cuf
deleted file mode 100644
index f68a9aa2aee53..0000000000000
--- a/flang/test/Lower/CUDA/TODO/cuda-allocate-default-init.cuf
+++ /dev/null
@@ -1,15 +0,0 @@
-! RUN: %not_todo_cmd bbc -emit-fir -fcuda -o - %s 2>&1 | FileCheck %s
-
-program test
-implicit none
-
-type :: t1
-   real(4) :: x_fin(1:10) = acos(-1.0_4)
-end type t1
-
-type(t1), allocatable, device :: t(:)
-
-! CHECK: not yet implemented: CUDA Fortran: allocate on device with default initialization
-allocate(t(1:2))
-
-end program
diff --git a/flang/test/Lower/Intrinsics/c_loc.f90 b/flang/test/Lower/Intrinsics/c_loc.f90
index ecd5ce590fd5d..da901576a982a 100644
--- a/flang/test/Lower/Intrinsics/c_loc.f90
+++ b/flang/test/Lower/Intrinsics/c_loc.f90
@@ -179,7 +179,7 @@ subroutine c_loc_arraysection()
 ! CHECK:         fir.store %[[VAL_7:.*]] to %[[VAL_0:.*]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK:         %[[VAL_8:.*]] = fir.convert %[[VAL_0:.*]] : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK:         %[[VAL_9:.*]] = fir.convert %[[VAL_5:.*]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-! CHECK:         %[[VAL_10:.*]] = fir.call @_FortranAPointerAllocate(%[[VAL_8:.*]], %[[VAL_false:.*]], %[[VAL_4:.*]], %[[VAL_9:.*]], %[[C_LN:.*]]) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:         %[[VAL_10:.*]] = fir.call @_FortranAPointerAllocate(%[[VAL_8:.*]], %[[VAL_false:.*]], %[[VAL_4:.*]], %[[VAL_9:.*]], %[[C_LN:.*]]) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:         %[[VAL_11:.*]] = fir.load %[[VAL_0:.*]] : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK:         %[[VAL_12:.*]] = fir.box_addr %[[VAL_11:.*]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
 ! CHECK:         fir.store %[[VAL_12:.*]] to %[[VAL_1:.*]] : !fir.ref<!fir.ptr<i32>>
diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90
index 98f7ed39edb41..12c32c58f86b9 100644
--- a/flang/test/Lower/OpenACC/acc-declare.f90
+++ b/flang/test/Lower/OpenACC/acc-declare.f90
@@ -434,6 +434,6 @@ subroutine init()
 end module
 
 ! CHECK-LABEL: func.func @_QMacc_declare_post_action_statPinit()
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEx_acc_declare_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK: fir.if
-! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAAllocatableAllocate({{.*}}) fastmath<contract> {acc.declare_action = #acc.declare_action<postAlloc = @_QMacc_declare_post_action_statEy_acc_declare_post_alloc>} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
index 3de2ba8f61f8e..0a157a47e894f 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
@@ -104,7 +104,7 @@ program reduce
 ! CHECK:           fir.call @_FortranAPointerSetBounds(%[[VAL_15]], %[[VAL_14]], %[[VAL_16]], %[[VAL_17]]) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK:           %[[VAL_19:.*]] = fir.convert %[[VAL_3]]#0 : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK:           %[[VAL_20:.*]] = fir.convert %[[VAL_6]] : (!fir.ref<!fir.char<{{.*}}>>) -> !fir.ref<i8>
-! CHECK:           %[[VAL_21:.*]] = fir.call @_FortranAPointerAllocate(%[[VAL_19]], %[[VAL_4]], %[[VAL_5]], %[[VAL_20]], %[[VAL_7]]) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %[[VAL_21:.*]] = fir.call @_FortranAPointerAllocate(%[[VAL_19]], %[[VAL_4]], %[[VAL_5]], %[[VAL_20]], %[[VAL_7]], {{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:           omp.parallel {
 ! CHECK:             %[[VAL_24:.*]] = arith.constant 0 : i32
 ! CHECK:             %[[VAL_25:.*]] = arith.constant 10 : i32
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90
index f640f5caddf76..3184a510c0858 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90
@@ -81,7 +81,7 @@ program reduce_pointer
 ! CHECK:           fir.store %[[VAL_11]] to %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_5]]#0 : (!fir.ref<!fir.box<!fir.ptr<i32>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_8]] : (!fir.ref<!fir.char<{{.*}}>>) -> !fir.ref<i8>
-! CHECK:           %[[VAL_14:.*]] = fir.call @_FortranAPointerAllocate(%[[VAL_12]], %[[VAL_6]], %[[VAL_7]], %[[VAL_13]], %[[VAL_9]]) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %[[VAL_14:.*]] = fir.call @_FortranAPointerAllocate(%[[VAL_12]], %[[VAL_6]], %[[VAL_7]], %[[VAL_13]], %[[VAL_9]], {{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:           %[[VAL_15:.*]] = arith.constant 0 : i32
 ! CHECK:           %[[VAL_16:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>
 ! CHECK:           %[[VAL_17:.*]] = fir.box_addr %[[VAL_16]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
diff --git a/flang/test/Lower/allocatable-polymorphic.f90 b/flang/test/Lower/allocatable-polymorphic.f90
index d528fd8e546ff..8ae59df68dd8b 100644
--- a/flang/test/Lower/allocatable-polymorphic.f90
+++ b/flang/test/Lower/allocatable-polymorphic.f90
@@ -104,7 +104,7 @@ subroutine test_pointer()
 ! CHECK-DAG: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAPointerNullifyDerived(%[[P_DESC_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[P_DESC_CAST:.*]] = fir.convert %[[P_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[P_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[P_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! call p%proc1()
 ! CHECK: %[[P_LOAD:.*]] = fir.load %[[P_DESC:.*]] : !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>
@@ -117,7 +117,7 @@ subroutine test_pointer()
 ! CHECK-DAG: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAPointerNullifyDerived(%[[C1_DESC_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[C1_DESC_CAST:.*]] = fir.convert %[[C1_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C1_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C1_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}>
 ! CHECK-DAG: %[[C2_DESC_CAST:.*]] = fir.convert %[[C2_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
@@ -126,7 +126,7 @@ subroutine test_pointer()
 ! CHECK-DAG: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAPointerNullifyDerived(%[[C2_DESC_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[CORANK]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[C2_DESC_CAST:.*]] = fir.convert %[[C2_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C2_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C2_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! call c1%proc1()
 ! CHECK: %[[C1_DESC_LOAD:.*]] = fir.load %[[C1_DECL]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>
@@ -155,7 +155,7 @@ subroutine test_pointer()
 ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: fir.call @_FortranAPointerSetBounds(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}>
 ! CHECK-DAG: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
@@ -166,7 +166,7 @@ subroutine test_pointer()
 ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: fir.call @_FortranAPointerSetBounds(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! CHECK-LABEL: fir.do_loop
 ! CHECK: %[[C3_LOAD:.*]] = fir.load %[[C3_DECL]]#0 : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>
@@ -267,7 +267,7 @@ subroutine test_allocatable()
 ! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[P_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[P_CAST:.*]] = fir.convert %[[P_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[P_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! CHECK: %[[TYPE_DESC_P1:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}>
 ! CHECK-DAG: %[[C1_CAST:.*]] = fir.convert %[[C1_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
@@ -276,7 +276,7 @@ subroutine test_allocatable()
 ! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C1_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[C1_CAST:.*]] = fir.convert %[[C1_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C1_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C1_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}>
 ! CHECK-DAG: %[[C2_CAST:.*]] = fir.convert %[[C2_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
@@ -285,7 +285,7 @@ subroutine test_allocatable()
 ! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitDerivedForAllocate(%[[C2_CAST]], %[[TYPE_DESC_P2_CAST]], %[[RANK]], %[[C0]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[C2_CAST:.*]] = fir.convert %[[C2_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C2_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C2_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! CHECK: %[[TYPE_DESC_P1:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}>
 ! CHECK-DAG: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
@@ -300,7 +300,7 @@ subroutine test_allocatable()
 ! CHECK-DAG: %[[C10_I64:.*]] = fir.convert %[[C10]] : (i32) -> i64
 ! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[C3_CAST]], %[[C0]], %[[C1_I64]], %[[C10_I64]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK: %[[C3_CAST:.*]] = fir.convert %[[C3_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C3_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! CHECK: %[[TYPE_DESC_P2:.*]] = fir.type_desc !fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}>
 ! CHECK-DAG: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
@@ -316,7 +316,7 @@ subroutine test_allocatable()
 ! CHECK-DAG: %[[C20_I64:.*]] = fir.convert %[[C20]] : (i32) -> i64
 ! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[C4_CAST]], %[[C0]], %[[C1_I64]], %[[C20_I64]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK: %[[C4_CAST:.*]] = fir.convert %[[C4_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[C4_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! CHECK: %[[C1_LOAD1:.*]] = fir.load %[[C1_DECL]]#0 : !fir.ref<!fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>
 ! CHECK: fir.dispatch "proc1"(%[[C1_LOAD1]] : !fir.class<!fir.heap<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>)
@@ -390,7 +390,7 @@ subroutine test_unlimited_polymorphic_with_intrinsic_type_spec()
 ! CHECK-DAG: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitIntrinsicForAllocate(%[[BOX_NONE]], %[[CAT]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, i32, i32, i32, i32) -> ()
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[P_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<none>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! CHECK-DAG: %[[BOX_NONE:.*]] = fir.convert %[[PTR_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<none>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK-DAG: %[[CAT:.*]] = arith.constant 2 : i32
@@ -399,7 +399,7 @@ subroutine test_unlimited_polymorphic_with_intrinsic_type_spec()
 ! CHECK-DAG: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAPointerNullifyIntrinsic(%[[BOX_NONE]], %[[CAT]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, i32, i32, i32, i32) -> ()
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[PTR_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<none>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 ! CHECK: %[[NULL_TYPE_DESC:.*]] = fir.zero_bits !fir.ref<none>
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[PTR_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<none>>>) -> !fir.ref<!fir.box<none>>
@@ -423,7 +423,7 @@ subroutine test_type_with_polymorphic_pointer_component()
 ! CHECK: %[[TYPE_PTR:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.type<_QMpolyTwith_alloc{element:!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>}>>> {bindc_name = "a", uniq_name = "_QMpolyFtest_type_with_polymorphic_pointer_componentEa"}
 ! CHECK: %[[TYPE_PTR_DECL:.*]]:2 = hlfir.declare %[[TYPE_PTR]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMpolyFtest_type_with_polymorphic_pointer_componentEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QMpolyTwith_alloc{element:!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>}>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QMpolyTwith_alloc{element:!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>}>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.type<_QMpolyTwith_alloc{element:!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>}>>>>)
 ! CHECK: %[[TYPE_PTR_CONV:.*]] = fir.convert %[[TYPE_PTR_DECL]]#0 : (!fir.ref<!fir.box<!fir.ptr<!fir.type<_QMpolyTwith_alloc{element:!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: fir.call @_FortranAPointerAllocate(%[[TYPE_PTR_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: fir.call @_FortranAPointerAllocate(%[[TYPE_PTR_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK: %[[TYPE_PTR_LOAD:.*]] = fir.load %[[TYPE_PTR_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.type<_QMpolyTwith_alloc{element:!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>}>>>>
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[TYPE_PTR_LOAD]] : (!fir.box<!fir.ptr<!fir.type<_QMpolyTwith_alloc{element:!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>}>>>) -> !fir.ptr<!fir.type<_QMpolyTwith_alloc{element:!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>}>>
 ! CHECK: %[[ELEMENT:.*]] = hlfir.designate %[[BOX_ADDR]]{"element"}   {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ptr<!fir.type<_QMpolyTwith_alloc{element:!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>}>>) -> !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>
@@ -437,7 +437,7 @@ subroutine test_type_with_polymorphic_pointer_component()
 ! CHECK-DAG: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAPointerNullifyDerived(%[[ELEMENT_DESC_CAST]], %[[TYPE_DESC_P1_CAST]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK: %[[ELEMENT_DESC_CAST:.*]] = fir.convert %[[ELEMENT]] : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[ELEMENT_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[ELEMENT_DESC_CAST]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
   subroutine test_allocate_with_mold()
     type(p2) :: x(10)
@@ -465,14 +465,14 @@ subroutine test_allocate_with_mold()
 ! CHECK: %[[X_BOX_NONE:.*]] = fir.convert %[[EMBOX_X]] : (!fir.box<!fir.array<10x!fir.type<_QMpolyTp2{p1:!fir.type<_QMpolyTp1{a:i32,b:i32}>,c:i32}>>>) -> !fir.box<none>
 ! CHECK: fir.call @_FortranAPointerApplyMold(%[[P_BOX_NONE]], %[[X_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> ()
 ! CHECK: %[[P_BOX_NONE:.*]] = fir.convert %[[P_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[P_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[P_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK: %[[EMBOX_I:.*]] = fir.embox %[[I_DECL]]#0(%{{.*}}) : (!fir.ref<!fir.array<20xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<20xi32>>
 ! CHECK: %[[RANK:.*]] = arith.constant 1 : i32
 ! CHECK: %[[UP_BOX_NONE:.*]] = fir.convert %[[UP_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: %[[I_BOX_NONE:.*]] = fir.convert %[[EMBOX_I]] : (!fir.box<!fir.array<20xi32>>) -> !fir.box<none>
 ! CHECK: fir.call @_FortranAPointerApplyMold(%[[UP_BOX_NONE]], %[[I_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> ()
 ! CHECK: %[[UP_BOX_NONE:.*]] = fir.convert %[[UP_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?xnone>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[UP_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%[[UP_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
   subroutine test_allocate_with_source()
     type(p2) :: x(10)
@@ -573,7 +573,7 @@ subroutine test_allocatable_up_character()
 ! CHECK-DAG: %[[CORANK:.*]] = arith.constant 0 : i32
 ! CHECK: fir.call @_FortranAAllocatableInitCharacterForAllocate(%[[A_NONE]], %[[LEN]], %[[KIND]], %[[RANK]], %[[CORANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, i64, i32, i32, i32) -> ()
 ! CHECK: %[[A_NONE:.*]] = fir.convert %[[A_DECL]]#0 : (!fir.ref<!fir.class<!fir.heap<none>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 end module
 
diff --git a/flang/test/Lower/allocatable-runtime.f90 b/flang/test/Lower/allocatable-runtime.f90
index c63252c68974e..e9e2da76bd788 100644
--- a/flang/test/Lower/allocatable-runtime.f90
+++ b/flang/test/Lower/allocatable-runtime.f90
@@ -31,7 +31,7 @@ subroutine foo()
   ! CHECK: fir.call @{{.*}}AllocatableSetBounds(%[[xBoxCast2]], %c0{{.*}}, %[[xlbCast]], %[[xubCast]]) {{.*}}: (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
   ! CHECK-DAG: %[[xBoxCast3:.*]] = fir.convert %[[xBoxAddr]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
   ! CHECK-DAG: %[[sourceFile:.*]] = fir.convert %{{.*}} -> !fir.ref<i8>
-  ! CHECK: fir.call @{{.*}}AllocatableAllocate(%[[xBoxCast3]], %{{.*}}, %false{{.*}}, %[[errMsg]], %[[sourceFile]], %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+  ! CHECK: fir.call @{{.*}}AllocatableAllocate(%[[xBoxCast3]], %{{.*}}, %false{{.*}}, %[[errMsg]], %[[sourceFile]], %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
   ! Simply check that we are emitting the right numebr of set bound for y and z. Otherwise, this is just like x.
   ! CHECK: fir.convert %[[yBoxAddr]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
@@ -180,4 +180,4 @@ subroutine mold_allocation()
 ! CHECK: %[[M_BOX_NONE:.*]] = fir.convert %[[EMBOX_M]] : (!fir.box<!fir.array<10xi32>>) -> !fir.box<none>
 ! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[A_BOX_NONE]], %[[M_BOX_NONE]], %[[RANK]]) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> ()
 ! CHECK: %[[A_BOX_NONE:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
diff --git a/flang/test/Lower/allocate-mold.f90 b/flang/test/Lower/allocate-mold.f90
index 9427c8b08786f..f3a577a135693 100644
--- a/flang/test/Lower/allocate-mold.f90
+++ b/flang/test/Lower/allocate-mold.f90
@@ -16,7 +16,7 @@ subroutine scalar_mold_allocation()
 ! CHECK: %[[A_REF_BOX_NONE1:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: fir.call @_FortranAAllocatableApplyMold(%[[A_REF_BOX_NONE1]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> ()
 ! CHECK: %[[A_REF_BOX_NONE2:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_REF_BOX_NONE2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[A_REF_BOX_NONE2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 
 subroutine array_scalar_mold_allocation()
   real, allocatable :: a(:)
@@ -40,4 +40,4 @@ end subroutine array_scalar_mold_allocation
 ! CHECK: %[[REF_BOX_A1:.*]] = fir.convert %1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: fir.call @_FortranAAllocatableSetBounds(%[[REF_BOX_A1]], {{.*}},{{.*}}, {{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK: %[[REF_BOX_A2:.*]] = fir.convert %[[A]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[REF_BOX_A2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[REF_BOX_A2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}} -> i32
diff --git a/flang/test/Lower/assign-statement.f90 b/flang/test/Lower/assign-statement.f90
index 342355bf469c8..f0691b324ffd0 100644
--- a/flang/test/Lower/assign-statement.f90
+++ b/flang/test/Lower/assign-statement.f90
@@ -7,6 +7,6 @@ program main
 
   allocate(ip)
   assign 10 to ip
-  ! CHECK: fir.store %c10_i32 to %11 : !fir.ptr<i32>
+  ! CHECK: fir.store %c10_i32 to %{{.*}} : !fir.ptr<i32>
   10 return
   end program main
diff --git a/flang/test/Lower/nullify-polymorphic.f90 b/flang/test/Lower/nullify-polymorphic.f90
index 9a6a2795d3057..ea18c08b034e2 100644
--- a/flang/test/Lower/nullify-polymorphic.f90
+++ b/flang/test/Lower/nullify-polymorphic.f90
@@ -44,7 +44,7 @@ program test
 ! CHECK-LABEL: func.func @_QMpolyPtest_nullify()
 ! CHECK: %[[C_DESC:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>> {bindc_name = "c", uniq_name = "_QMpolyFtest_nullifyEc"}
 ! CHECK: %[[C_DESC_DECL:.*]]:2 = hlfir.declare %[[C_DESC]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMpolyFtest_nullifyEc"} : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>)
-! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK: %[[DECLARED_TYPE_DESC:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}>
 ! CHECK: %[[C_DESC_CAST:.*]] = fir.convert %[[C_DESC_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: %[[TYPE_DESC_CAST:.*]] = fir.convert %[[DECLARED_TYPE_DESC]] : (!fir.tdesc<!fir.type<_QMpolyTp1{a:i32,b:i32}>>) -> !fir.ref<none>
diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90
index 689e2233dcf10..9268eae398971 100644
--- a/flang/test/Lower/polymorphic.f90
+++ b/flang/test/Lower/polymorphic.f90
@@ -1156,7 +1156,7 @@ program test
 ! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "TEST"} {
 ! CHECK: %[[ADDR_O:.*]] = fir.address_of(@_QFEo) : !fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ADDR_O]] : (!fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK: %[[O:.*]] = fir.load %[[ADDR_O]] : !fir.ref<!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>>
 ! CHECK: %[[COORD_INNER:.*]] = fir.coordinate_of %[[O]], inner : (!fir.box<!fir.heap<!fir.type<_QMpolymorphic_testTouter{inner:!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>}>>>) -> !fir.ref<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
 ! CHECK: %{{.*}} = fir.do_loop %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%arg1 = %{{.*}}) -> (!fir.array<5x!fir.logical<4>>) {
diff --git a/flang/test/Lower/volatile-allocatable.f90 b/flang/test/Lower/volatile-allocatable.f90
index 3147698114115..4c0bea60ecaa1 100644
--- a/flang/test/Lower/volatile-allocatable.f90
+++ b/flang/test/Lower/volatile-allocatable.f90
@@ -124,15 +124,15 @@ subroutine test_unlimited_polymorphic()
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, volatile>, uniq_name = "_QFtest_scalar_volatileEv2"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>, !fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>)
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, volatile>, uniq_name = "_QFtest_scalar_volatileEv3"} : (!fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>) -> (!fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>, !fir.ref<!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>, volatile>)
 ! CHECK:           fir.call @_FortranAAllocatableInitDerivedForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:           fir.call @_FortranAAllocatableInitDerivedForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFtest_scalar_volatileEv1"} : (!fir.box<!fir.heap<!fir.type<{{.*}}>>, volatile>) -> (!fir.box<!fir.type<{{.*}}>, volatile>, !fir.box<!fir.type<{{.*}}>, volatile>)
 ! CHECK:           %{{.+}} = hlfir.designate %{{.+}}#0{"j"}   : (!fir.box<!fir.type<{{.*}}>, volatile>) -> !fir.ref<i32, volatile>
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro._QMderived_typesText_type.0"} : (!fir.ref<!fir.type<{{.*}}>>) -> (!fir.ref<!fir.type<{{.*}}>>, !fir.ref<!fir.type<{{.*}}>>)
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocateSource(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} typeparams %{{.+}} {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX766F6C6174696C6520636861726163746572"} : (!fir.ref<!fir.char<1,18>>, index) -> (!fir.ref<!fir.char<1,18>>, !fir.ref<!fir.char<1,18>>)
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocatePolymorphic(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -144,7 +144,7 @@ subroutine test_unlimited_polymorphic()
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, asynchronous, volatile>, uniq_name = "_QFtest_volatile_asynchronousEv1"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>)
 ! CHECK:           fir.call @_FortranAAllocatableInitDerivedForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK:           fir.call @_FortranAAllocatableSetBounds(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}}(%{{.+}}) {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro.4xi4.1"} : (!fir.ref<!fir.array<4xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<4xi32>>, !fir.ref<!fir.array<4xi32>>)
 ! CHECK:           fir.call @_FortranAAllocatableSetBounds(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocateSource(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
@@ -154,7 +154,7 @@ subroutine test_unlimited_polymorphic()
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, volatile>, uniq_name = "_QFtest_select_base_type_volatileEv"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>, !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, volatile>)
 ! CHECK:           fir.call @_FortranAAllocatableInitDerivedForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
 ! CHECK:           fir.call @_FortranAAllocatableSetBounds(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAClassIs(%{{.+}}, %{{.+}}) : (!fir.box<none>, !fir.ref<none>) -> i1
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}}(%{{.+}}) {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFtest_select_base_type_volatileEv"} : (!fir.class<!fir.heap<!fir.array<?x!fir.type<{{.*}}>>>, volatile>, !fir.shift<1>) -> (!fir.class<!fir.array<?x!fir.type<{{.*}}>>, volatile>, !fir.class<!fir.array<?x!fir.type<{{.*}}>>, volatile>)
 ! CHECK:           %{{.+}} = hlfir.designate %{{.+}}#0 (%{{.+}})  : (!fir.class<!fir.array<?x!fir.type<{{.*}}>>, volatile>, index) -> !fir.class<!fir.type<{{.*}}>, volatile>
@@ -170,22 +170,22 @@ subroutine test_unlimited_polymorphic()
 ! CHECK:           %{{.+}} = hlfir.designate %{{.+}}#0{"arr"}   shape %{{.+}} : (!fir.ref<!fir.type<{{.*}}>>, !fir.shape<1>) -> !fir.ref<!fir.array<2xi32>>
 ! CHECK:           fir.call @_FortranAAllocatableApplyMold(%{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32) -> ()
 ! CHECK:           fir.call @_FortranAAllocatableSetBounds(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
 ! CHECK-LABEL:   func.func @_QPtest_unlimited_polymorphic() {
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, volatile>, uniq_name = "_QFtest_unlimited_polymorphicEup"} : (!fir.ref<!fir.class<!fir.heap<none>, volatile>, volatile>) -> (!fir.ref<!fir.class<!fir.heap<none>, volatile>, volatile>, !fir.ref<!fir.class<!fir.heap<none>, volatile>, volatile>)
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<allocatable, volatile>, uniq_name = "_QFtest_unlimited_polymorphicEupa"} : (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>, volatile>, volatile>) -> (!fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>, volatile>, volatile>, !fir.ref<!fir.class<!fir.heap<!fir.array<?xnone>>, volatile>, volatile>)
 ! CHECK:           fir.call @_FortranAAllocatableInitIntrinsicForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i32, i32, i32) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFtest_unlimited_polymorphicEup"} : (!fir.heap<i32>) -> (!fir.heap<i32>, !fir.heap<i32>)
 ! CHECK:           fir.call @_FortranAAllocatableInitCharacterForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i64, i32, i32, i32) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} typeparams %{{.+}} {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFtest_unlimited_polymorphicEup"} : (!fir.heap<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.heap<!fir.char<1,?>>)
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}} typeparams %{{.+}} {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQclX636C617373282A29"} : (!fir.ref<!fir.char<1,8>>, index) -> (!fir.ref<!fir.char<1,8>>, !fir.ref<!fir.char<1,8>>)
 ! CHECK:           fir.call @_FortranAAllocatableInitIntrinsicForAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i32, i32, i32) -> ()
 ! CHECK:           fir.call @_FortranAAllocatableSetBounds(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
-! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableAllocate(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}}(%{{.+}}) {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFtest_unlimited_polymorphicEupa"} : (!fir.box<!fir.heap<!fir.array<?xf32>>, volatile>, !fir.shift<1>) -> (!fir.box<!fir.array<?xf32>, volatile>, !fir.box<!fir.array<?xf32>, volatile>)
 ! CHECK:           %{{.+}}:2 = hlfir.declare %{{.+}}(%{{.+}}) {fortran_attrs = #fir.var_attrs<parameter>, uniq_name = "_QQro.3xr4.3"} : (!fir.ref<!fir.array<3xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<3xf32>>, !fir.ref<!fir.array<3xf32>>)
 ! CHECK:           %{{.+}} = fir.call @_FortranAAllocatableDeallocatePolymorphic(%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
diff --git a/flang/test/Transforms/lower-repack-arrays.fir b/flang/test/Transforms/lower-repack-arrays.fir
index 9232a74f224d3..9506cc19ac993 100644
--- a/flang/test/Transforms/lower-repack-arrays.fir
+++ b/flang/test/Transforms/lower-repack-arrays.fir
@@ -817,7 +817,7 @@ func.func @_QPtest6(%arg0: !fir.class<!fir.array<?x?x!fir.type<_QMmTt>>> {fir.bi
 // CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_20]] : (!fir.ref<none>) -> !fir.ref<i64>
 // CHECK:               %[[VAL_25:.*]] = fir.convert %[[VAL_21]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]], {{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 // CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>
 // CHECK:               %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> !fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>
 // CHECK:               %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> !fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>
@@ -894,7 +894,7 @@ func.func @_QPtest6_stack(%arg0: !fir.class<!fir.array<?x?x!fir.type<_QMmTt>>> {
 // CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_20]] : (!fir.ref<none>) -> !fir.ref<i64>
 // CHECK:               %[[VAL_25:.*]] = fir.convert %[[VAL_21]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]], {{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 // CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>>
 // CHECK:               %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> !fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>
 // CHECK:               %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class<!fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>>) -> !fir.heap<!fir.array<?x?x!fir.type<_QMmTt>>>
@@ -970,7 +970,7 @@ func.func @_QPtest7(%arg0: !fir.class<!fir.array<?x?xnone>> {fir.bindc_name = "x
 // CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_20]] : (!fir.ref<none>) -> !fir.ref<i64>
 // CHECK:               %[[VAL_25:.*]] = fir.convert %[[VAL_21]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]], {{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 // CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>
 // CHECK:               %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !fir.class<!fir.heap<!fir.array<?x?xnone>>>
 // CHECK:               %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !fir.heap<!fir.array<?x?xnone>>
@@ -1047,7 +1047,7 @@ func.func @_QPtest7_stack(%arg0: !fir.class<!fir.array<?x?xnone>> {fir.bindc_nam
 // CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_5]] : (!fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>) -> !fir.ref<!fir.box<none>>
 // CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_20]] : (!fir.ref<none>) -> !fir.ref<i64>
 // CHECK:               %[[VAL_25:.*]] = fir.convert %[[VAL_21]] : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
-// CHECK:               %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
+// CHECK:               %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]], {{.*}}) : (!fir.ref<!fir.box<none>>, !fir.ref<i64>, i1, !fir.box<none>, !fir.ref<i8>, i32, {{.*}}) -> i32
 // CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x?xnone>>>>
 // CHECK:               %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !fir.class<!fir.heap<!fir.array<?x?xnone>>>
 // CHECK:               %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class<!fir.heap<!fir.array<?x?xnone>>>) -> !fir.heap<!fir.array<?x?xnone>>

>From 713ae731b40fd4530386d427b309e3643fd77d0c Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 16 Dec 2025 14:42:16 -0800
Subject: [PATCH 2/5] Remove TODO

---
 flang/lib/Lower/Allocatable.cpp | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 0996e30dfb553..1912027f8742d 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -773,16 +773,6 @@ class AllocateStmtHelper {
                               const fir::MutableBoxValue &box,
                               ErrorManager &errorManager,
                               const Fortran::semantics::Symbol &sym) {
-
-    // if (const Fortran::semantics::DeclTypeSpec *declTypeSpec = sym.GetType())
-    //   if (const Fortran::semantics::DerivedTypeSpec *derivedTypeSpec =
-    //           declTypeSpec->AsDerived())
-    //     if (derivedTypeSpec->HasDefaultInitialization(
-    //             /*ignoreAllocatable=*/true, /*ignorePointer=*/true))
-    //       TODO(loc,
-    //            "CUDA Fortran: allocate on device with default
-    //            initialization");
-
     Fortran::lower::StatementContext stmtCtx;
     cuf::DataAttributeAttr cudaAttr =
         Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),

>From 48685b02d9d9c046702a82454abebffde9be07ec Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 16 Dec 2025 15:21:37 -0800
Subject: [PATCH 3/5] Set memcpy as default

---
 flang-rt/lib/runtime/derived.cpp              | 24 ++++---------------
 flang/include/flang/Runtime/allocatable.h     |  9 ++++++-
 .../flang/Runtime/freestanding-tools.h        |  4 ++++
 flang/include/flang/Runtime/pointer.h         |  8 ++++++-
 4 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index 31529716633fb..6e5d5498af177 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -73,11 +73,7 @@ RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
       // Explicit initialization of data pointers and
       // non-allocatable non-automatic components
       std::size_t bytes{component_->SizeInBytes(instance_)};
-      if (!memmoveFct_) {
-        runtime::memcpy(rawComponent, init, bytes);
-      } else {
-        memmoveFct_(rawComponent, init, bytes);
-      }
+      memmoveFct_(rawComponent, init, bytes);
     } else if (component_->genre() == typeInfo::Component::Genre::Pointer ||
         component_->genre() == typeInfo::Component::Genre::PointerDevice) {
       // Data pointers without explicit initialization are established
@@ -115,32 +111,20 @@ RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
             chunk = done;
           }
           char *uninitialized{rawInstance + done * *stride};
-          if (!memmoveFct_) {
-            runtime::memcpy(uninitialized, rawInstance, chunk * *stride);
-          } else {
-            memmoveFct_(uninitialized, rawInstance, chunk * *stride);
-          }
+          memmoveFct_(uninitialized, rawInstance, chunk * *stride);
           done += chunk;
         }
       } else {
         for (std::size_t done{1}; done < elements_; ++done) {
           char *uninitialized{rawInstance + done * *stride};
-          if (!memmoveFct_) {
-            runtime::memcpy(uninitialized, rawInstance, elementBytes);
-          } else {
-            memmoveFct_(uninitialized, rawInstance, elementBytes);
-          }
+          memmoveFct_(uninitialized, rawInstance, elementBytes);
         }
       }
     } else { // one at a time with subscription
       for (Elementwise::Advance(); !Elementwise::IsComplete();
           Elementwise::Advance()) {
         char *element{instance_.Element<char>(subscripts_)};
-        if (!memmoveFct_) {
-          runtime::memcpy(element, rawInstance, elementBytes);
-        } else {
-          memmoveFct_(element, rawInstance, elementBytes);
-        }
+        memmoveFct_(element, rawInstance, elementBytes);
       }
     }
   }
diff --git a/flang/include/flang/Runtime/allocatable.h b/flang/include/flang/Runtime/allocatable.h
index df0a79160cbfc..f7df7802fb4b8 100644
--- a/flang/include/flang/Runtime/allocatable.h
+++ b/flang/include/flang/Runtime/allocatable.h
@@ -95,10 +95,17 @@ int RTDECL(AllocatableCheckLengthParameter)(Descriptor &,
 // Successfully allocated memory is initialized if the allocatable has a
 // derived type, and is always initialized by AllocatableAllocateSource().
 // Performs all necessary coarray synchronization and validation actions.
+#ifdef RT_DEVICE_COMPILATION
 int RTDECL(AllocatableAllocate)(Descriptor &,
     std::int64_t *asyncObject = nullptr, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0, MemmoveFct memmoveFct = nullptr);
+    int sourceLine = 0, MemmoveFct memmoveFct = &MemcpyWrapper);
+#else
+int RTDECL(AllocatableAllocate)(Descriptor &,
+    std::int64_t *asyncObject = nullptr, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0, MemmoveFct memmoveFct = &Fortran::runtime::memcpy);
+#endif
 int RTDECL(AllocatableAllocateSource)(Descriptor &, const Descriptor &source,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);
diff --git a/flang/include/flang/Runtime/freestanding-tools.h b/flang/include/flang/Runtime/freestanding-tools.h
index 7ef7cc74f213b..77a42fd9901dd 100644
--- a/flang/include/flang/Runtime/freestanding-tools.h
+++ b/flang/include/flang/Runtime/freestanding-tools.h
@@ -179,6 +179,10 @@ using MemmoveFct = void *(*)(void *, const void *, std::size_t);
     void *dest, const void *src, std::size_t count) {
   return Fortran::runtime::memmove(dest, src, count);
 }
+[[maybe_unused]] static RT_API_ATTRS void MemcpyWrapper(
+    void *dest, const void *src, std::size_t count) {
+  return Fortran::runtime::memcpy(dest, src, count);
+}
 #endif
 
 #if STD_STRLEN_USE_BUILTIN
diff --git a/flang/include/flang/Runtime/pointer.h b/flang/include/flang/Runtime/pointer.h
index 2812a8619796f..88e4ef446a6c7 100644
--- a/flang/include/flang/Runtime/pointer.h
+++ b/flang/include/flang/Runtime/pointer.h
@@ -89,9 +89,15 @@ int RTDECL(PointerCheckLengthParameter)(Descriptor &,
 // Successfully allocated memory is initialized if the pointer has a
 // derived type, and is always initialized by PointerAllocateSource().
 // Performs all necessary coarray synchronization and validation actions.
+#ifdef RT_DEVICE_COMPILATION
 int RTDECL(PointerAllocate)(Descriptor &, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0, MemmoveFct memmoveFct = nullptr);
+    int sourceLine = 0, MemmoveFct memmoveFct = &MemcpyWrapper);
+#else
+int RTDECL(PointerAllocate)(Descriptor &, bool hasStat = false,
+    const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
+    int sourceLine = 0, MemmoveFct memmoveFct = &Fortran::runtime::memmove);
+#endif
 int RTDECL(PointerAllocateSource)(Descriptor &, const Descriptor &source,
     bool hasStat = false, const Descriptor *errMsg = nullptr,
     const char *sourceFile = nullptr, int sourceLine = 0);

>From 582944b10c06c9ea71655fd0ddd8e9fb48b915c3 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 16 Dec 2025 15:22:10 -0800
Subject: [PATCH 4/5] Fix memcpy

---
 flang/include/flang/Runtime/pointer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/include/flang/Runtime/pointer.h b/flang/include/flang/Runtime/pointer.h
index 88e4ef446a6c7..0ed17c2c668be 100644
--- a/flang/include/flang/Runtime/pointer.h
+++ b/flang/include/flang/Runtime/pointer.h
@@ -96,7 +96,7 @@ int RTDECL(PointerAllocate)(Descriptor &, bool hasStat = false,
 #else
 int RTDECL(PointerAllocate)(Descriptor &, bool hasStat = false,
     const Descriptor *errMsg = nullptr, const char *sourceFile = nullptr,
-    int sourceLine = 0, MemmoveFct memmoveFct = &Fortran::runtime::memmove);
+    int sourceLine = 0, MemmoveFct memmoveFct = &Fortran::runtime::memcpy);
 #endif
 int RTDECL(PointerAllocateSource)(Descriptor &, const Descriptor &source,
     bool hasStat = false, const Descriptor *errMsg = nullptr,

>From f833728d2c5eca0cf0cb79d975e822989cd81e69 Mon Sep 17 00:00:00 2001
From: Valentin Clement <clementval at gmail.com>
Date: Tue, 16 Dec 2025 17:48:16 -0800
Subject: [PATCH 5/5] Try fix for windows buildbot

---
 .../flang/Optimizer/Builder/Runtime/RTBuilder.h    | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
index 3a3f629fbf5c3..960405ee0006f 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/RTBuilder.h
@@ -264,6 +264,20 @@ getModel<void *(*)(void *, const void *, unsigned long)>() {
     return fir::LLVMPointerType::get(context, funcTy);
   };
 }
+#ifdef _MSC_VER
+template <>
+constexpr TypeBuilderFunc
+getModel<void *(*)(void *, const void *, unsigned __int64)>() {
+  return [](mlir::MLIRContext *context) -> mlir::Type {
+    auto voidPtrTy =
+        fir::LLVMPointerType::get(context, mlir::IntegerType::get(context, 8));
+    auto uint64Ty = mlir::IntegerType::get(context, 64);
+    auto funcTy = mlir::FunctionType::get(
+        context, {voidPtrTy, voidPtrTy, uint64Ty}, {voidPtrTy});
+    return fir::LLVMPointerType::get(context, funcTy);
+  };
+}
+#endif
 template <>
 constexpr TypeBuilderFunc getModel<void **>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {



More information about the flang-commits mailing list