[flang-commits] [flang] [flang][OpenMP][CUDA] Place privatized device allocatable descriptors in managed memory (PR #187114)

Zhen Wang via flang-commits flang-commits at lists.llvm.org
Tue Mar 17 14:10:52 PDT 2026


https://github.com/wangzpgi updated https://github.com/llvm/llvm-project/pull/187114

>From a8f84d5f6bc05235ebc37abdbc5c02cc6f6498d8 Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Tue, 17 Mar 2026 12:58:02 -0700
Subject: [PATCH 1/3] place descriptor for device data in managed memory

---
 .../Lower/Support/PrivateReductionUtils.cpp   | 26 +++++++++++++++++++
 ...elayed-privatization-cuda-device-array.cuf | 18 +++++++++----
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index 551d8bae41fd4..5b99b3140b480 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -115,6 +115,16 @@ static void createCleanupRegion(Fortran::lower::AbstractConverter &converter,
     fir::FreeMemOp::create(builder, loc, cast);
 
     builder.setInsertionPointAfter(ifOp);
+    // Free the managed descriptor if this is a CUDA device allocatable.
+    if (sym) {
+      unsigned idx = Fortran::lower::getAllocatorIdx(sym->GetUltimate());
+      if (idx != kDefaultAllocator) {
+        cuf::DataAttributeAttr dataAttr =
+            Fortran::lower::translateSymbolCUFDataAttribute(
+                builder.getContext(), sym->GetUltimate());
+        cuf::FreeOp::create(builder, loc, block->getArgument(0), dataAttr);
+      }
+    }
     if (isDoConcurrent)
       fir::YieldOp::create(builder, loc);
     else
@@ -665,6 +675,22 @@ void PopulateInitAndCleanupRegionsHelper::populateByRefInitAndCleanupRegions() {
   if (auto boxTy = mlir::dyn_cast_or_null<fir::BaseBoxType>(valTy)) {
     builder.setInsertionPointToEnd(initBlock);
 
+    // For CUDA device allocatables, allocate the descriptor in managed
+    // memory so that CUF kernels can access it from the GPU.
+    if (sym && mlir::isa<fir::HeapType>(boxTy.getEleTy())) {
+      unsigned idx = Fortran::lower::getAllocatorIdx(sym->GetUltimate());
+      if (idx != kDefaultAllocator) {
+        cuf::DataAttributeAttr dataAttr =
+            Fortran::lower::translateSymbolCUFDataAttribute(
+                builder.getContext(), sym->GetUltimate());
+        auto managedDesc = cuf::AllocOp::create(
+            builder, loc, valTy, /*uniq_name=*/llvm::StringRef{},
+            /*bindc_name=*/llvm::StringRef{}, dataAttr,
+            /*typeparams=*/mlir::ValueRange{}, /*shape=*/mlir::ValueRange{});
+        allocatedPrivVarArg = managedDesc.getResult();
+      }
+    }
+
     // TODO: don't do this unless it is needed
     getLengthParameters(builder, loc, getLoadedMoldArg(), lenParams);
 
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf b/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
index cb9f9ea58d4cd..941e682c77068 100644
--- a/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
+++ b/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
@@ -1,6 +1,9 @@
-! Test that OpenMP privatization of CUDA Fortran allocatable device arrays
-! sets allocator_idx = 2 on the null descriptor so that user-written
-! allocate() (after cudaSetDevice) uses cudaMalloc on the correct GPU.
+! Test that OpenMP privatization of CUDA Fortran allocatable device arrays:
+! 1. Allocates the descriptor in managed memory (cuf.alloc) so CUF kernels
+!    can access it from the GPU.
+! 2. Sets allocator_idx = 2 on the null descriptor so that allocate() uses
+!    cudaMalloc.
+! 3. Frees the managed descriptor in the dealloc region (cuf.free).
 
 ! RUN: bbc -emit-hlfir -fcuda -fopenmp %s -o - | FileCheck %s
 
@@ -17,8 +20,13 @@ end subroutine
 
 ! CHECK-LABEL: omp.private {type = private}
 ! CHECK-SAME: @{{.*}} : !fir.box<!fir.heap<!fir.array<?xf64>>> init {
-! Null descriptor must carry allocator_idx = 2 so that a later
-! allocate() inside the parallel region calls cudaMalloc, not malloc.
+! Descriptor must be allocated in managed memory for GPU accessibility.
+! CHECK:          cuf.alloc !fir.box<!fir.heap<!fir.array<?xf64>>> {data_attr = #cuf.cuda<device>}
+! Null descriptor must carry allocator_idx = 2.
 ! CHECK:          fir.embox %{{.*}}(%{{.*}}) {allocator_idx = 2 : i32}
 ! CHECK:          fir.store
+! CHECK:        } dealloc {
+! Managed descriptor must be freed.
+! CHECK:          cuf.free %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>> {data_attr = #cuf.cuda<device>}
+! CHECK:        }
 ! CHECK-LABEL: func.func

>From 9ad00f21ac8e04e9222c6e0e08dd45f14620e033 Mon Sep 17 00:00:00 2001
From: Zhen Wang <37195552+wangzpgi at users.noreply.github.com>
Date: Tue, 17 Mar 2026 13:46:05 -0700
Subject: [PATCH 2/3] Update flang/lib/Lower/Support/PrivateReductionUtils.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Valentin Clement (バレンタイン クレメン) <clementval at gmail.com>
---
 flang/lib/Lower/Support/PrivateReductionUtils.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index 5b99b3140b480..93375b7892427 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -683,11 +683,10 @@ void PopulateInitAndCleanupRegionsHelper::populateByRefInitAndCleanupRegions() {
         cuf::DataAttributeAttr dataAttr =
             Fortran::lower::translateSymbolCUFDataAttribute(
                 builder.getContext(), sym->GetUltimate());
-        auto managedDesc = cuf::AllocOp::create(
+        allocatedPrivVarArg = cuf::AllocOp::create(
             builder, loc, valTy, /*uniq_name=*/llvm::StringRef{},
             /*bindc_name=*/llvm::StringRef{}, dataAttr,
-            /*typeparams=*/mlir::ValueRange{}, /*shape=*/mlir::ValueRange{});
-        allocatedPrivVarArg = managedDesc.getResult();
+            /*typeparams=*/mlir::ValueRange{}, /*shape=*/mlir::ValueRange{}).getResult();
       }
     }
 

>From 73178cd0b185cb649260f96547e910cbf22bfecf Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw at nvidia.com>
Date: Tue, 17 Mar 2026 14:01:01 -0700
Subject: [PATCH 3/3] format

---
 flang/lib/Lower/Support/PrivateReductionUtils.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index 93375b7892427..aae433c023d01 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -683,10 +683,13 @@ void PopulateInitAndCleanupRegionsHelper::populateByRefInitAndCleanupRegions() {
         cuf::DataAttributeAttr dataAttr =
             Fortran::lower::translateSymbolCUFDataAttribute(
                 builder.getContext(), sym->GetUltimate());
-        allocatedPrivVarArg = cuf::AllocOp::create(
-            builder, loc, valTy, /*uniq_name=*/llvm::StringRef{},
-            /*bindc_name=*/llvm::StringRef{}, dataAttr,
-            /*typeparams=*/mlir::ValueRange{}, /*shape=*/mlir::ValueRange{}).getResult();
+        allocatedPrivVarArg =
+            cuf::AllocOp::create(builder, loc, valTy,
+                                 /*uniq_name=*/llvm::StringRef{},
+                                 /*bindc_name=*/llvm::StringRef{}, dataAttr,
+                                 /*typeparams=*/mlir::ValueRange{},
+                                 /*shape=*/mlir::ValueRange{})
+                .getResult();
       }
     }
 



More information about the flang-commits mailing list