[flang-commits] [flang] 51937fc - Revert "[flang][OpenMP] Use cuf.alloc for privatization of CUDA Fortran device arrays (#185984)" (#186891)

Mon Mar 16 14:51:14 PDT 2026

Author: Zhen Wang
Date: 2026-03-16T21:51:07Z
New Revision: 51937fc9969c39bafc4991ceeb7c7113696aa7df

URL: https://github.com/llvm/llvm-project/commit/51937fc9969c39bafc4991ceeb7c7113696aa7df
DIFF: https://github.com/llvm/llvm-project/commit/51937fc9969c39bafc4991ceeb7c7113696aa7df.diff

LOG: Revert "[flang][OpenMP] Use cuf.alloc for privatization of CUDA Fortran device arrays (#185984)" (#186891)

This reverts commit fb18d570b0466ca2a401aba11d6e58b206aebc1a.

The original PR (#185984) caused compilation failures with allocatable
arrays, so it is being reverted for now pending further investigation.

Added: 
    

Modified: 
    flang/include/flang/Lower/CUDA.h
    flang/lib/Lower/CUDA.cpp
    flang/lib/Lower/ConvertVariable.cpp
    flang/lib/Lower/Support/PrivateReductionUtils.cpp

Removed: 
    flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf


################################################################################
diff  --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index 83523c21380d6..865a7c6a6fa78 100644

--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -62,14 +62,6 @@ cuf::DataAttributeAttr
 translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
                                 const Fortran::semantics::Symbol &sym);
 
-/// Create a cuf.alloc operation with extents and length parameters elided
-/// when they are already encoded in the static type.
-mlir::Value genCUFAlloc(fir::FirOpBuilder &builder, mlir::Location loc,
-                        mlir::Type type, llvm::StringRef uniqName,
-                        llvm::StringRef bindcName,
-                        cuf::DataAttributeAttr dataAttr,
-                        mlir::ValueRange lenParams, mlir::ValueRange extents);
-
 /// Check if the rhs has an implicit conversion. Return the elemental op if
 /// there is a conversion. Return null otherwise.
 std::pair<hlfir::ElementalOp, hlfir::ElementalOp>

diff  --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index 8fd54bc6cfe2d..d8e2d829f9adf 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -68,24 +68,6 @@ cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute(
   return cuf::getDataAttribute(mlirContext, cudaAttr);
 }
 
-mlir::Value Fortran::lower::genCUFAlloc(fir::FirOpBuilder &builder,
-                                        mlir::Location loc, mlir::Type type,
-                                        llvm::StringRef uniqName,
-                                        llvm::StringRef bindcName,
-                                        cuf::DataAttributeAttr dataAttr,
-                                        mlir::ValueRange lenParams,
-                                        mlir::ValueRange extents) {
-  llvm::SmallVector<mlir::Value> elidedExtents =
-      fir::factory::elideExtentsAlreadyInType(type, extents);
-  llvm::SmallVector<mlir::Value> elidedLenParams =
-      fir::factory::elideLengthsAlreadyInType(type, lenParams);
-  auto idxTy = builder.getIndexType();
-  for (mlir::Value &ext : elidedExtents)
-    ext = builder.createConvert(loc, idxTy, ext);
-  return cuf::AllocOp::create(builder, loc, type, uniqName, bindcName, dataAttr,
-                              elidedLenParams, elidedExtents);
-}
-
 std::pair<hlfir::ElementalOp, hlfir::ElementalOp>
 Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
   auto isCopyElementalOp = [](hlfir::ElementalOp elOp) {

diff  --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index ec406c9997de2..0ededb364bfea 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -760,20 +760,21 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter,
     cuf::DataAttributeAttr dataAttr =
         Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
                                                         ultimateSymbol);
-    if (dataAttr.getValue() == cuf::DataAttribute::Shared) {
-      llvm::SmallVector<mlir::Value> elidedShape =
-          fir::factory::elideExtentsAlreadyInType(ty, shape);
-      auto idxTy = builder.getIndexType();
-      llvm::SmallVector<mlir::Value> indices;
-      for (mlir::Value sh : elidedShape)
-        indices.push_back(builder.createConvert(loc, idxTy, sh));
+    llvm::SmallVector<mlir::Value> indices;
+    llvm::SmallVector<mlir::Value> elidedShape =
+        fir::factory::elideExtentsAlreadyInType(ty, shape);
+    llvm::SmallVector<mlir::Value> elidedLenParams =
+        fir::factory::elideLengthsAlreadyInType(ty, lenParams);
+    auto idxTy = builder.getIndexType();
+    for (mlir::Value sh : elidedShape)
+      indices.push_back(builder.createConvert(loc, idxTy, sh));
+    if (dataAttr.getValue() == cuf::DataAttribute::Shared)
       return cuf::SharedMemoryOp::create(builder, loc, ty, nm, symNm, lenParams,
                                          indices);
-    }
 
     if (!cuf::isCUDADeviceContext(builder.getRegion()))
-      return Fortran::lower::genCUFAlloc(builder, loc, ty, nm, symNm, dataAttr,
-                                         lenParams, shape);
+      return cuf::AllocOp::create(builder, loc, ty, nm, symNm, dataAttr,
+                                  lenParams, indices);
   }
 
   // Let the builder do all the heavy lifting.

diff  --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index 680375bee9d91..f63fb6ecfe43f 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -14,7 +14,6 @@
 
 #include "flang/Lower/AbstractConverter.h"
 #include "flang/Lower/Allocatable.h"
-#include "flang/Lower/CUDA.h"
 #include "flang/Lower/ConvertVariable.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
 #include "flang/Optimizer/Builder/Character.h"
@@ -22,14 +21,12 @@
 #include "flang/Optimizer/Builder/HLFIRTools.h"
 #include "flang/Optimizer/Builder/Runtime/Derived.h"
 #include "flang/Optimizer/Builder/Todo.h"
-#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/HLFIR/HLFIRDialect.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/FatalError.h"
 #include "flang/Semantics/symbol.h"
-#include "flang/Semantics/tools.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Location.h"
 #include "llvm/Support/CommandLine.h"
@@ -49,11 +46,11 @@ static bool hasFinalization(const Fortran::semantics::Symbol &sym) {
   return false;
 }
 
-static void createCleanupRegion(
-    Fortran::lower::AbstractConverter &converter, mlir::Location loc,
-    mlir::Type argType, mlir::Region &cleanupRegion,
-    const Fortran::semantics::Symbol *sym, bool isDoConcurrent,
-    std::optional<cuf::DataAttributeAttr> cudaDataAttr = std::nullopt) {
+static void createCleanupRegion(Fortran::lower::AbstractConverter &converter,
+                                mlir::Location loc, mlir::Type argType,
+                                mlir::Region &cleanupRegion,
+                                const Fortran::semantics::Symbol *sym,
+                                bool isDoConcurrent) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   assert(cleanupRegion.empty());
   mlir::Block *block = builder.createBlock(&cleanupRegion, cleanupRegion.end(),
@@ -112,14 +109,9 @@ static void createCleanupRegion(
         fir::IfOp::create(builder, loc, isAllocated, /*withElseRegion=*/false);
     builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
 
-    if (cudaDataAttr) {
-      cuf::FreeOp::create(builder, loc, addr, *cudaDataAttr);
-    } else {
-      mlir::Value cast = builder.createConvert(
-          loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())),
-          addr);
-      fir::FreeMemOp::create(builder, loc, cast);
-    }
+    mlir::Value cast = builder.createConvert(
+        loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr);
+    fir::FreeMemOp::create(builder, loc, cast);
 
     builder.setInsertionPointAfter(ifOp);
     if (isDoConcurrent)
@@ -555,31 +547,6 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
     if (shouldAllocateTempOnStack(boxTy))
       return createStackTempFromMold(loc, builder, source);
 
-    // For CUDA device arrays that require special allocation (device,
-    // managed, unified, etc.), use cuf.alloc instead of fir.allocmem so
-    // that the private copy lives in device memory.
-    if (sym && Fortran::semantics::NeedCUDAAlloc(sym->GetUltimate())) {
-      cuf::DataAttributeAttr dataAttr =
-          Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
-                                                          sym->GetUltimate());
-      mlir::Type sequenceType =
-          hlfir::getFortranElementOrSequenceType(source.getType());
-      mlir::Value shape = hlfir::genShape(loc, builder, source);
-      auto extents = hlfir::getIndexExtents(loc, builder, shape);
-      mlir::Value alloc = Fortran::lower::genCUFAlloc(
-          builder, loc, sequenceType, /*uniqName=*/"", /*bindcName=*/".tmp",
-          dataAttr, lenParams, extents);
-      auto declareOp = hlfir::DeclareOp::create(
-          builder, loc, alloc, ".tmp", shape, lenParams,
-          /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0,
-          fir::FortranVariableFlagsAttr{}, dataAttr);
-      hlfir::Entity temp{declareOp.getBase()};
-      mlir::OpBuilder::InsertionGuard guard(builder);
-      createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
-                          isDoConcurrent, dataAttr);
-      return temp;
-    }
-
     auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
     // if needsDealloc, add cleanup region. Always
     // do this for allocatable boxes because they might have been re-allocated

diff  --git a/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf b/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
deleted file mode 100644
index c62f1c4173145..0000000000000
--- a/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
+++ /dev/null
@@ -1,31 +0,0 @@
-! Test that OpenMP privatization of CUDA Fortran device arrays uses cuf.alloc
-! instead of fir.allocmem so the private copy resides in device memory.
-
-! RUN: bbc -emit-hlfir -fcuda -fopenmp %s -o - | FileCheck %s
-
-subroutine omp_private_device_array()
-  implicit none
-  integer(4), device :: a(8)
-
-  !$omp parallel private(a)
-    a(1) = 42
-  !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME: @[[PRIVATIZER:.*]] : !fir.box<!fir.array<8xi32>> init {
-
-! CHECK-NEXT: ^bb0(%[[MOLD:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>, %[[PRIV:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>):
-! CHECK-NEXT:   %[[C8:.*]] = arith.constant 8 : index
-! CHECK-NEXT:   %[[SHAPE:.*]] = fir.shape %[[C8]]
-! CHECK-NEXT:   %[[ALLOC:.*]] = cuf.alloc !fir.array<8xi32> {bindc_name = ".tmp", data_attr = #cuf.cuda<device>}
-! CHECK-NEXT:   %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]](%[[SHAPE]]) {data_attr = #cuf.cuda<device>, uniq_name = ".tmp"}
-! CHECK:        fir.embox
-! CHECK:        fir.store
-! CHECK-NEXT:   omp.yield
-
-! CHECK: } dealloc {
-! CHECK-NEXT: ^bb0(%[[DEALLOC_ARG:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>):
-! CHECK:        cuf.free %{{.*}} {data_attr = #cuf.cuda<device>}
-! CHECK:        omp.yield
-! CHECK-NEXT: }