[flang-commits] [flang] f57f338 - [flang][cuda] Add double descriptor information in allocate/deallocate operations (#170901)

Fri Dec 5 11:08:49 PST 2025

Author: Valentin Clement (バレンタイン クレメン)
Date: 2025-12-05T11:08:45-08:00
New Revision: f57f338313c19ff94959203ea98e96e4ec895c51

URL: https://github.com/llvm/llvm-project/commit/f57f338313c19ff94959203ea98e96e4ec895c51
DIFF: https://github.com/llvm/llvm-project/commit/f57f338313c19ff94959203ea98e96e4ec895c51.diff

LOG: [flang][cuda] Add double descriptor information in allocate/deallocate operations (#170901)

After https://github.com/llvm/llvm-project/pull/169740, the cuf allocate
and deallocate operations can be converted later. Update how the double
descriptor case is recognized by adding this information directly on
the operations themselves.

Added: 
    

Modified: 
    flang/include/flang/Lower/CUDA.h
    flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
    flang/lib/Lower/Allocatable.cpp
    flang/lib/Lower/CUDA.cpp
    flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
    flang/test/Fir/CUDA/cuda-allocate.fir
    flang/test/Lower/CUDA/cuda-allocatable.cuf

Removed: 
    


################################################################################
diff  --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index ef7cdc42d72f2..704b0356c19ed 100644

--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -66,6 +66,9 @@ translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
 /// there is a conversion. Return null otherwise.
 hlfir::ElementalOp isTransferWithConversion(mlir::Value rhs);
 
+/// Check if the value is an allocatable with double descriptor.
+bool hasDoubleDescriptor(mlir::Value);
+
 } // end namespace Fortran::lower
 
 #endif // FORTRAN_LOWER_CUDA_H

diff  --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 920bef99dc996..766a0d6bb8ee0 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -100,7 +100,8 @@ def cuf_AllocateOp : cuf_Op<"allocate", [AttrSizedOperandSegments,
       Optional<fir_ReferenceType>:$stream,
       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$pinned,
       Arg<Optional<AnyRefOrBoxType>, "", [MemRead]>:$source,
-      cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat);
+      cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat,
+      UnitAttr:$hasDoubleDescriptor);
 
   let results = (outs AnyIntegerType:$stat);
 
@@ -126,9 +127,9 @@ def cuf_DeallocateOp : cuf_Op<"deallocate",
   }];
 
   let arguments = (ins Arg<fir_ReferenceType, "", [MemRead, MemWrite]>:$box,
-                       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
-                       cuf_DataAttributeAttr:$data_attr,
-                       UnitAttr:$hasStat);
+      Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
+      cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat,
+      UnitAttr:$hasDoubleDescriptor);
 
   let results = (outs AnyIntegerType:$stat);
 

diff  --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index e7a6c4df40045..2ae13e2bd73fb 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -798,10 +798,12 @@ class AllocateStmtHelper {
     // Keep return type the same as a standard AllocatableAllocate call.
     mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
 
+    bool doubleDescriptors = Fortran::lower::hasDoubleDescriptor(box.getAddr());
     return cuf::AllocateOp::create(
                builder, loc, retTy, box.getAddr(), errmsg, stream, pinned,
                source, cudaAttr,
-               errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
+               errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr,
+               doubleDescriptors ? builder.getUnitAttr() : nullptr)
         .getResult();
   }
 
@@ -865,11 +867,13 @@ static mlir::Value genCudaDeallocate(fir::FirOpBuilder &builder,
           ? nullptr
           : errorManager.errMsgAddr;
 
-  // Keep return type the same as a standard AllocatableAllocate call.
+  // Keep return type the same as a standard AllocatableDeallocate call.
   mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
+  bool doubleDescriptors = Fortran::lower::hasDoubleDescriptor(box.getAddr());
   return cuf::DeallocateOp::create(
              builder, loc, retTy, box.getAddr(), errmsg, cudaAttr,
-             errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
+             errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr,
+             doubleDescriptors ? builder.getUnitAttr() : nullptr)
       .getResult();
 }
 

diff  --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index 9501b0ec60002..fb055286df46b 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -91,3 +91,17 @@ hlfir::ElementalOp Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
       return elOp;
   return {};
 }
+
+bool Fortran::lower::hasDoubleDescriptor(mlir::Value addr) {
+  if (auto declareOp =
+          mlir::dyn_cast_or_null<hlfir::DeclareOp>(addr.getDefiningOp())) {
+    if (mlir::isa_and_nonnull<fir::AddrOfOp>(
+            declareOp.getMemref().getDefiningOp())) {
+      if (declareOp.getDataAttr() &&
+          *declareOp.getDataAttr() == cuf::DataAttribute::Pinned)
+        return false;
+      return true;
+    }
+  }
+  return false;
+}

diff  --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
index 0acdb24bf62b1..2c40991580c2e 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
@@ -62,28 +62,6 @@ static inline unsigned getMemType(cuf::DataAttribute attr) {
   llvm_unreachable("unsupported memory type");
 }
 
-template <typename OpTy>
-static bool hasDoubleDescriptors(OpTy op) {
-  if (auto declareOp =
-          mlir::dyn_cast_or_null<fir::DeclareOp>(op.getBox().getDefiningOp())) {
-    if (mlir::isa_and_nonnull<fir::AddrOfOp>(
-            declareOp.getMemref().getDefiningOp())) {
-      if (isPinned(declareOp))
-        return false;
-      return true;
-    }
-  } else if (auto declareOp = mlir::dyn_cast_or_null<hlfir::DeclareOp>(
-                 op.getBox().getDefiningOp())) {
-    if (mlir::isa_and_nonnull<fir::AddrOfOp>(
-            declareOp.getMemref().getDefiningOp())) {
-      if (isPinned(declareOp))
-        return false;
-      return true;
-    }
-  }
-  return false;
-}
-
 static bool inDeviceContext(mlir::Operation *op) {
   if (op->getParentOfType<cuf::KernelOp>())
     return true;
@@ -353,7 +331,7 @@ struct CUFAllocateOpConversion
                              fir::FortranVariableFlagsEnum::pointer))
         isPointer = true;
 
-    if (hasDoubleDescriptors(op)) {
+    if (op.getHasDoubleDescriptor()) {
       // Allocation for module variable are done with custom runtime entry point
       // so the descriptors can be synchronized.
       mlir::func::FuncOp func;
@@ -406,7 +384,7 @@ struct CUFDeallocateOpConversion
     fir::FirOpBuilder builder(rewriter, mod);
     mlir::Location loc = op.getLoc();
 
-    if (hasDoubleDescriptors(op)) {
+    if (op.getHasDoubleDescriptor()) {
       // Deallocation for module variable are done with custom runtime entry
       // point so the descriptors can be synchronized.
       mlir::func::FuncOp func =

diff  --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index ea7890c9aac52..eb2816145c77a 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -37,8 +37,8 @@ fir.global @_QMmod1Ea {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.
 func.func @_QPsub3() {
   %0 = fir.address_of(@_QMmod1Ea) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
   %1:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
-  %2 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
-  %3 = cuf.deallocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
+  %2 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
+  %3 = cuf.deallocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
   return
 }
 
@@ -109,7 +109,7 @@ func.func @_QQsub6() attributes {fir.bindc_name = "test"} {
   %3 = fir.convert %c1 : (index) -> i64
   %4 = fir.convert %c10_i32 : (i32) -> i64
   fir.call @_FortranAAllocatableSetBounds(%2, %c0_i32, %3, %4) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
-  %6 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>} -> i32
+  %6 = cuf.allocate %1#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
   return
 }
 
@@ -158,7 +158,7 @@ func.func @_QMmod1Pallocate_source_global() {
   %2 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QMmod1Fallocate_source_globalEa"}
   %6 = fir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Fallocate_source_globalEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
   %7 = fir.load %6 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
-  %21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
+  %21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
   return
 }
 
@@ -226,7 +226,7 @@ func.func @_QQpointer_sync() attributes {fir.bindc_name = "test"} {
   %3 = fir.convert %c1 : (index) -> i64
   %4 = fir.convert %c10_i32 : (i32) -> i64
   fir.call @_FortranAAllocatableSetBounds(%2, %c0_i32, %3, %4) fastmath<contract> : (!fir.ref<!fir.box<none>>, i32, i64, i64) -> ()
-  %6 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>} -> i32
+  %6 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
   return
 }
 
@@ -246,7 +246,7 @@ func.func @_QMmod1Ppointer_source_global() {
   %2 = fir.alloca !fir.box<!fir.ptr<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QMmod1Fallocate_source_globalEa"}
   %6 = fir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmod1Fallocate_source_globalEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
   %7 = fir.load %6 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>>
-  %21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.ptr<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
+  %21 = cuf.allocate %1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x?xf32>>>> source(%7 : !fir.box<!fir.ptr<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
   return
 }
 

diff  --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
index 2cf8c7d336812..393faff6046bc 100644
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -235,3 +235,21 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPcuda_component()
 ! CHECK: cuf.allocate
+
+subroutine module_allocate()
+  use globals
+  allocate(a_device(10))
+  allocate(a_managed(10))
+  allocate(a_pinned(10))
+  deallocate(a_device)
+  deallocate(a_managed)
+  deallocate(a_pinned)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPmodule_allocate() 
+! CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
+! CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<managed>, hasDoubleDescriptor} -> i32
+! CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<pinned>} -> i32
+! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
+! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<managed>, hasDoubleDescriptor} -> i32
+! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<pinned>} -> i32