[flang-commits] [flang] [flang][OpenMP] Use cuf.alloc for privatization of CUDA Fortran device arrays (PR #185984)

Thu Mar 12 09:58:40 PDT 2026

https://github.com/wangzpgi updated https://github.com/llvm/llvm-project/pull/185984

>From cb12f6a92f8ed89ed71ac119abb8b457f22e1455 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Wed, 11 Mar 2026 14:27:39 -0700
Subject: [PATCH 1/4] Use cuf.alloc for privatization of CUDA Fortran device
 arrays

---
 .../Lower/Support/PrivateReductionUtils.cpp   | 57 +++++++++++++++++++
 ...elayed-privatization-cuda-device-array.cuf | 31 ++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf

diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index d1a965d288cad..26cbaa93c0f23 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -14,6 +14,7 @@
 
 #include "flang/Lower/AbstractConverter.h"
 #include "flang/Lower/Allocatable.h"
+#include "flang/Lower/CUDA.h"
 #include "flang/Lower/ConvertVariable.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
 #include "flang/Optimizer/Builder/Character.h"
@@ -21,12 +22,14 @@
 #include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/Runtime/Derived.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/HLFIR/HLFIRDialect.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/FatalError.h"
 #include "flang/Semantics/symbol.h"
+#include "flang/Semantics/tools.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Location.h"
 
@@ -530,6 +533,60 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
     if (shouldAllocateTempOnStack())
       return createStackTempFromMold(loc, builder, source);
 
+    // For CUDA device arrays that require special allocation (device,
+    // managed, unified, etc.), use cuf.alloc instead of fir.allocmem so
+    // that the private copy lives in device memory.
+    if (sym && Fortran::semantics::NeedCUDAAlloc(sym->GetUltimate())) {
+      cuf::DataAttributeAttr dataAttr =
+          Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
+                                                          sym->GetUltimate());
+      mlir::Type sequenceType =
+          hlfir::getFortranElementOrSequenceType(source.getType());
+      mlir::Value shape = hlfir::genShape(loc, builder, source);
+      auto extents = hlfir::getIndexExtents(loc, builder, shape);
+      llvm::SmallVector<mlir::Value> elidedExtents =
+          fir::factory::elideExtentsAlreadyInType(sequenceType, extents);
+      llvm::SmallVector<mlir::Value> elidedLenParams =
+          fir::factory::elideLengthsAlreadyInType(sequenceType, lenParams);
+      auto idxTy = builder.getIndexType();
+      for (mlir::Value &ext : elidedExtents)
+        ext = builder.createConvert(loc, idxTy, ext);
+      auto allocOp = cuf::AllocOp::create(builder, loc, sequenceType,
+                                          /*uniqName=*/"",
+                                          /*bindcName=*/".tmp", dataAttr,
+                                          elidedLenParams, elidedExtents);
+      auto declareOp = hlfir::DeclareOp::create(
+          builder, loc, allocOp.getResult(), ".tmp", shape, lenParams,
+          /*dummy_scope=*/nullptr, /*storage=*/nullptr,
+          /*storage_offset=*/0, fir::FortranVariableFlagsAttr{}, dataAttr);
+
+      // Create cleanup region using cuf.free for device deallocation.
+      {
+        mlir::OpBuilder::InsertionGuard guard(builder);
+        assert(cleanupRegion.empty());
+        mlir::Block *block = builder.createBlock(
+            &cleanupRegion, cleanupRegion.end(), {argType}, {loc});
+        builder.setInsertionPointToEnd(block);
+
+        mlir::Value arg = builder.loadIfRef(loc, block->getArgument(0));
+        assert(mlir::isa<fir::BaseBoxType>(arg.getType()));
+        mlir::Value addr =
+            hlfir::genVariableRawAddress(loc, builder, hlfir::Entity{arg});
+        mlir::Value isAllocated = builder.genIsNotNullAddr(loc, addr);
+        fir::IfOp ifOp = fir::IfOp::create(builder, loc, isAllocated,
+                                           /*withElseRegion=*/false);
+        builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+        cuf::FreeOp::create(builder, loc, addr, dataAttr);
+        builder.setInsertionPointAfter(ifOp);
+        if (isDoConcurrent)
+          fir::YieldOp::create(builder, loc);
+        else
+          mlir::omp::YieldOp::create(builder, loc);
+      }
+
+      return hlfir::Entity{declareOp.getBase()};
+    }
+
     auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
     // if needsDealloc, add cleanup region. Always
     // do this for allocatable boxes because they might have been re-allocated
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf b/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
new file mode 100644
index 0000000000000..c62f1c4173145
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
@@ -0,0 +1,31 @@
+! Test that OpenMP privatization of CUDA Fortran device arrays uses cuf.alloc
+! instead of fir.allocmem so the private copy resides in device memory.
+
+! RUN: bbc -emit-hlfir -fcuda -fopenmp %s -o - | FileCheck %s
+
+subroutine omp_private_device_array()
+  implicit none
+  integer(4), device :: a(8)
+
+  !$omp parallel private(a)
+    a(1) = 42
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER:.*]] : !fir.box<!fir.array<8xi32>> init {
+
+! CHECK-NEXT: ^bb0(%[[MOLD:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>, %[[PRIV:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>):
+! CHECK-NEXT:   %[[C8:.*]] = arith.constant 8 : index
+! CHECK-NEXT:   %[[SHAPE:.*]] = fir.shape %[[C8]]
+! CHECK-NEXT:   %[[ALLOC:.*]] = cuf.alloc !fir.array<8xi32> {bindc_name = ".tmp", data_attr = #cuf.cuda<device>}
+! CHECK-NEXT:   %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]](%[[SHAPE]]) {data_attr = #cuf.cuda<device>, uniq_name = ".tmp"}
+! CHECK:        fir.embox
+! CHECK:        fir.store
+! CHECK-NEXT:   omp.yield
+
+! CHECK: } dealloc {
+! CHECK-NEXT: ^bb0(%[[DEALLOC_ARG:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>):
+! CHECK:        cuf.free %{{.*}} {data_attr = #cuf.cuda<device>}
+! CHECK:        omp.yield
+! CHECK-NEXT: }

>From c057921e80a55a3c57cc80712162399e990d41eb Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Wed, 11 Mar 2026 15:42:04 -0700
Subject: [PATCH 2/4] Refactor: extract createCUFTempFromMold helper and
 integrate CUF cleanup into createCleanupRegion

---
 .../Lower/Support/PrivateReductionUtils.cpp   | 98 +++++++++----------
 1 file changed, 45 insertions(+), 53 deletions(-)

diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index 26cbaa93c0f23..6dd639bc069be 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -42,11 +42,11 @@ static bool hasFinalization(const Fortran::semantics::Symbol &sym) {
   return false;
 }
 
-static void createCleanupRegion(Fortran::lower::AbstractConverter &converter,
-                                mlir::Location loc, mlir::Type argType,
-                                mlir::Region &cleanupRegion,
-                                const Fortran::semantics::Symbol *sym,
-                                bool isDoConcurrent) {
+static void createCleanupRegion(
+    Fortran::lower::AbstractConverter &converter, mlir::Location loc,
+    mlir::Type argType, mlir::Region &cleanupRegion,
+    const Fortran::semantics::Symbol *sym, bool isDoConcurrent,
+    std::optional<cuf::DataAttributeAttr> cudaDataAttr = std::nullopt) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   assert(cleanupRegion.empty());
   mlir::Block *block = builder.createBlock(&cleanupRegion, cleanupRegion.end(),
@@ -105,9 +105,14 @@ static void createCleanupRegion(Fortran::lower::AbstractConverter &converter,
         fir::IfOp::create(builder, loc, isAllocated, /*withElseRegion=*/false);
     builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
 
-    mlir::Value cast = builder.createConvert(
-        loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr);
-    fir::FreeMemOp::create(builder, loc, cast);
+    if (cudaDataAttr) {
+      cuf::FreeOp::create(builder, loc, addr, *cudaDataAttr);
+    } else {
+      mlir::Value cast = builder.createConvert(
+          loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())),
+          addr);
+      fir::FreeMemOp::create(builder, loc, cast);
+    }
 
     builder.setInsertionPointAfter(ifOp);
     if (isDoConcurrent)
@@ -488,6 +493,32 @@ bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack() const {
   return offloadMod && offloadMod.getIsGPU();
 }
 
+/// Create a device-allocated temporary from a mold using cuf.alloc
+static hlfir::Entity
+createCUFTempFromMold(mlir::Location loc, fir::FirOpBuilder &builder,
+                      hlfir::Entity mold, cuf::DataAttributeAttr dataAttr,
+                      llvm::ArrayRef<mlir::Value> lenParams) {
+  mlir::Type sequenceType =
+      hlfir::getFortranElementOrSequenceType(mold.getType());
+  mlir::Value shape = hlfir::genShape(loc, builder, mold);
+  auto extents = hlfir::getIndexExtents(loc, builder, shape);
+  llvm::SmallVector<mlir::Value> elidedExtents =
+      fir::factory::elideExtentsAlreadyInType(sequenceType, extents);
+  llvm::SmallVector<mlir::Value> elidedLenParams =
+      fir::factory::elideLengthsAlreadyInType(sequenceType, lenParams);
+  auto idxTy = builder.getIndexType();
+  for (mlir::Value &ext : elidedExtents)
+    ext = builder.createConvert(loc, idxTy, ext);
+  auto allocOp = cuf::AllocOp::create(builder, loc, sequenceType,
+                                      /*uniqName=*/"", /*bindcName=*/".tmp",
+                                      dataAttr, elidedLenParams, elidedExtents);
+  auto declareOp = hlfir::DeclareOp::create(
+      builder, loc, allocOp.getResult(), ".tmp", shape, lenParams,
+      /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0,
+      fir::FortranVariableFlagsAttr{}, dataAttr);
+  return hlfir::Entity{declareOp.getBase()};
+}
+
 void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
     fir::BaseBoxType boxTy, bool needsInitialization) {
   bool isAllocatableOrPointer =
@@ -540,51 +571,12 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
       cuf::DataAttributeAttr dataAttr =
           Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
                                                           sym->GetUltimate());
-      mlir::Type sequenceType =
-          hlfir::getFortranElementOrSequenceType(source.getType());
-      mlir::Value shape = hlfir::genShape(loc, builder, source);
-      auto extents = hlfir::getIndexExtents(loc, builder, shape);
-      llvm::SmallVector<mlir::Value> elidedExtents =
-          fir::factory::elideExtentsAlreadyInType(sequenceType, extents);
-      llvm::SmallVector<mlir::Value> elidedLenParams =
-          fir::factory::elideLengthsAlreadyInType(sequenceType, lenParams);
-      auto idxTy = builder.getIndexType();
-      for (mlir::Value &ext : elidedExtents)
-        ext = builder.createConvert(loc, idxTy, ext);
-      auto allocOp = cuf::AllocOp::create(builder, loc, sequenceType,
-                                          /*uniqName=*/"",
-                                          /*bindcName=*/".tmp", dataAttr,
-                                          elidedLenParams, elidedExtents);
-      auto declareOp = hlfir::DeclareOp::create(
-          builder, loc, allocOp.getResult(), ".tmp", shape, lenParams,
-          /*dummy_scope=*/nullptr, /*storage=*/nullptr,
-          /*storage_offset=*/0, fir::FortranVariableFlagsAttr{}, dataAttr);
-
-      // Create cleanup region using cuf.free for device deallocation.
-      {
-        mlir::OpBuilder::InsertionGuard guard(builder);
-        assert(cleanupRegion.empty());
-        mlir::Block *block = builder.createBlock(
-            &cleanupRegion, cleanupRegion.end(), {argType}, {loc});
-        builder.setInsertionPointToEnd(block);
-
-        mlir::Value arg = builder.loadIfRef(loc, block->getArgument(0));
-        assert(mlir::isa<fir::BaseBoxType>(arg.getType()));
-        mlir::Value addr =
-            hlfir::genVariableRawAddress(loc, builder, hlfir::Entity{arg});
-        mlir::Value isAllocated = builder.genIsNotNullAddr(loc, addr);
-        fir::IfOp ifOp = fir::IfOp::create(builder, loc, isAllocated,
-                                           /*withElseRegion=*/false);
-        builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-        cuf::FreeOp::create(builder, loc, addr, dataAttr);
-        builder.setInsertionPointAfter(ifOp);
-        if (isDoConcurrent)
-          fir::YieldOp::create(builder, loc);
-        else
-          mlir::omp::YieldOp::create(builder, loc);
-      }
-
-      return hlfir::Entity{declareOp.getBase()};
+      hlfir::Entity temp =
+          createCUFTempFromMold(loc, builder, source, dataAttr, lenParams);
+      mlir::OpBuilder::InsertionGuard guard(builder);
+      createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
+                          isDoConcurrent, dataAttr);
+      return temp;
     }
 
     auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);

>From 726922ba7471e5c04dc9645b04e473253f7a8ce0 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 12 Mar 2026 09:12:06 -0700
Subject: [PATCH 3/4] Extract the common cuf.alloc creation pattern into a
 shared helper

---
 flang/include/flang/Lower/CUDA.h              |  8 ++++
 flang/lib/Lower/CUDA.cpp                      | 18 +++++++++
 flang/lib/Lower/ConvertVariable.cpp           | 38 +++++++++----------
 .../Lower/Support/PrivateReductionUtils.cpp   | 15 ++------
 4 files changed, 48 insertions(+), 31 deletions(-)

diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index 865a7c6a6fa78..83523c21380d6 100644
--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -62,6 +62,14 @@ cuf::DataAttributeAttr
 translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
                                 const Fortran::semantics::Symbol &sym);
 
+/// Create a cuf.alloc operation with extents and length parameters elided
+/// when they are already encoded in the static type.
+mlir::Value genCUFAlloc(fir::FirOpBuilder &builder, mlir::Location loc,
+                        mlir::Type type, llvm::StringRef uniqName,
+                        llvm::StringRef bindcName,
+                        cuf::DataAttributeAttr dataAttr,
+                        mlir::ValueRange lenParams, mlir::ValueRange extents);
+
 /// Check if the rhs has an implicit conversion. Return the elemental op if
 /// there is a conversion. Return null otherwise.
 std::pair<hlfir::ElementalOp, hlfir::ElementalOp>
diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index d8e2d829f9adf..8fd54bc6cfe2d 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -68,6 +68,24 @@ cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute(
   return cuf::getDataAttribute(mlirContext, cudaAttr);
 }
 
+mlir::Value Fortran::lower::genCUFAlloc(fir::FirOpBuilder &builder,
+                                        mlir::Location loc, mlir::Type type,
+                                        llvm::StringRef uniqName,
+                                        llvm::StringRef bindcName,
+                                        cuf::DataAttributeAttr dataAttr,
+                                        mlir::ValueRange lenParams,
+                                        mlir::ValueRange extents) {
+  llvm::SmallVector<mlir::Value> elidedExtents =
+      fir::factory::elideExtentsAlreadyInType(type, extents);
+  llvm::SmallVector<mlir::Value> elidedLenParams =
+      fir::factory::elideLengthsAlreadyInType(type, lenParams);
+  auto idxTy = builder.getIndexType();
+  for (mlir::Value &ext : elidedExtents)
+    ext = builder.createConvert(loc, idxTy, ext);
+  return cuf::AllocOp::create(builder, loc, type, uniqName, bindcName, dataAttr,
+                              elidedLenParams, elidedExtents);
+}
+
 std::pair<hlfir::ElementalOp, hlfir::ElementalOp>
 Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
   auto isCopyElementalOp = [](hlfir::ElementalOp elOp) {
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 0ededb364bfea..75f0ff947e7a3 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -585,15 +585,14 @@ fir::GlobalOp Fortran::lower::defineGlobal(
     if (details && details->init()) {
       auto sym{*details->init()};
       if (sym) // Has a procedure target.
-        createGlobalInitialization(
-            builder, global, [&](fir::FirOpBuilder &b) {
-              Fortran::lower::StatementContext stmtCtx(
-                  /*cleanupProhibited=*/true);
-              auto box{Fortran::lower::convertProcedureDesignatorInitialTarget(
-                  converter, loc, *sym)};
-              auto castTo{builder.createConvert(loc, symTy, box)};
-              fir::HasValueOp::create(b, loc, castTo);
-            });
+        createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) {
+          Fortran::lower::StatementContext stmtCtx(
+              /*cleanupProhibited=*/true);
+          auto box{Fortran::lower::convertProcedureDesignatorInitialTarget(
+              converter, loc, *sym)};
+          auto castTo{builder.createConvert(loc, symTy, box)};
+          fir::HasValueOp::create(b, loc, castTo);
+        });
       else { // Has NULL() target.
         createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) {
           auto box{fir::factory::createNullBoxProc(b, loc, symTy)};
@@ -760,21 +759,20 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter,
     cuf::DataAttributeAttr dataAttr =
         Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
                                                         ultimateSymbol);
-    llvm::SmallVector<mlir::Value> indices;
-    llvm::SmallVector<mlir::Value> elidedShape =
-        fir::factory::elideExtentsAlreadyInType(ty, shape);
-    llvm::SmallVector<mlir::Value> elidedLenParams =
-        fir::factory::elideLengthsAlreadyInType(ty, lenParams);
-    auto idxTy = builder.getIndexType();
-    for (mlir::Value sh : elidedShape)
-      indices.push_back(builder.createConvert(loc, idxTy, sh));
-    if (dataAttr.getValue() == cuf::DataAttribute::Shared)
+    if (dataAttr.getValue() == cuf::DataAttribute::Shared) {
+      llvm::SmallVector<mlir::Value> elidedShape =
+          fir::factory::elideExtentsAlreadyInType(ty, shape);
+      auto idxTy = builder.getIndexType();
+      llvm::SmallVector<mlir::Value> indices;
+      for (mlir::Value sh : elidedShape)
+        indices.push_back(builder.createConvert(loc, idxTy, sh));
       return cuf::SharedMemoryOp::create(builder, loc, ty, nm, symNm, lenParams,
                                          indices);
+    }
 
     if (!cuf::isCUDADeviceContext(builder.getRegion()))
-      return cuf::AllocOp::create(builder, loc, ty, nm, symNm, dataAttr,
-                                  lenParams, indices);
+      return Fortran::lower::genCUFAlloc(builder, loc, ty, nm, symNm, dataAttr,
+                                         lenParams, shape);
   }
 
   // Let the builder do all the heavy lifting.
diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index 6dd639bc069be..43833f03075ea 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -502,18 +502,11 @@ createCUFTempFromMold(mlir::Location loc, fir::FirOpBuilder &builder,
       hlfir::getFortranElementOrSequenceType(mold.getType());
   mlir::Value shape = hlfir::genShape(loc, builder, mold);
   auto extents = hlfir::getIndexExtents(loc, builder, shape);
-  llvm::SmallVector<mlir::Value> elidedExtents =
-      fir::factory::elideExtentsAlreadyInType(sequenceType, extents);
-  llvm::SmallVector<mlir::Value> elidedLenParams =
-      fir::factory::elideLengthsAlreadyInType(sequenceType, lenParams);
-  auto idxTy = builder.getIndexType();
-  for (mlir::Value &ext : elidedExtents)
-    ext = builder.createConvert(loc, idxTy, ext);
-  auto allocOp = cuf::AllocOp::create(builder, loc, sequenceType,
-                                      /*uniqName=*/"", /*bindcName=*/".tmp",
-                                      dataAttr, elidedLenParams, elidedExtents);
+  mlir::Value alloc = Fortran::lower::genCUFAlloc(
+      builder, loc, sequenceType, /*uniqName=*/"", /*bindcName=*/".tmp",
+      dataAttr, lenParams, extents);
   auto declareOp = hlfir::DeclareOp::create(
-      builder, loc, allocOp.getResult(), ".tmp", shape, lenParams,
+      builder, loc, alloc, ".tmp", shape, lenParams,
       /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0,
       fir::FortranVariableFlagsAttr{}, dataAttr);
   return hlfir::Entity{declareOp.getBase()};

>From c1886380b2b0d09c4842d88b16425377ce8608de Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Thu, 12 Mar 2026 09:58:21 -0700
Subject: [PATCH 4/4] Inline createCUFTempFromMold at the single call site

---
 .../Lower/Support/PrivateReductionUtils.cpp   | 33 +++++++------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index 43833f03075ea..d879a0b7e97aa 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -493,25 +493,6 @@ bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack() const {
   return offloadMod && offloadMod.getIsGPU();
 }
 
-/// Create a device-allocated temporary from a mold using cuf.alloc
-static hlfir::Entity
-createCUFTempFromMold(mlir::Location loc, fir::FirOpBuilder &builder,
-                      hlfir::Entity mold, cuf::DataAttributeAttr dataAttr,
-                      llvm::ArrayRef<mlir::Value> lenParams) {
-  mlir::Type sequenceType =
-      hlfir::getFortranElementOrSequenceType(mold.getType());
-  mlir::Value shape = hlfir::genShape(loc, builder, mold);
-  auto extents = hlfir::getIndexExtents(loc, builder, shape);
-  mlir::Value alloc = Fortran::lower::genCUFAlloc(
-      builder, loc, sequenceType, /*uniqName=*/"", /*bindcName=*/".tmp",
-      dataAttr, lenParams, extents);
-  auto declareOp = hlfir::DeclareOp::create(
-      builder, loc, alloc, ".tmp", shape, lenParams,
-      /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0,
-      fir::FortranVariableFlagsAttr{}, dataAttr);
-  return hlfir::Entity{declareOp.getBase()};
-}
-
 void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
     fir::BaseBoxType boxTy, bool needsInitialization) {
   bool isAllocatableOrPointer =
@@ -564,8 +545,18 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
       cuf::DataAttributeAttr dataAttr =
           Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
                                                           sym->GetUltimate());
-      hlfir::Entity temp =
-          createCUFTempFromMold(loc, builder, source, dataAttr, lenParams);
+      mlir::Type sequenceType =
+          hlfir::getFortranElementOrSequenceType(source.getType());
+      mlir::Value shape = hlfir::genShape(loc, builder, source);
+      auto extents = hlfir::getIndexExtents(loc, builder, shape);
+      mlir::Value alloc = Fortran::lower::genCUFAlloc(
+          builder, loc, sequenceType, /*uniqName=*/"", /*bindcName=*/".tmp",
+          dataAttr, lenParams, extents);
+      auto declareOp = hlfir::DeclareOp::create(
+          builder, loc, alloc, ".tmp", shape, lenParams,
+          /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0,
+          fir::FortranVariableFlagsAttr{}, dataAttr);
+      hlfir::Entity temp{declareOp.getBase()};
       mlir::OpBuilder::InsertionGuard guard(builder);
       createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
                           isDoConcurrent, dataAttr);