[flang] [llvm] [flang][cuda] Add support for allocate with device source (PR #171743)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 10 16:02:41 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-flang-fir-hlfir
Author: Valentin Clement (バレンタイン クレメン) (clementval)
<details>
<summary>Changes</summary>
Add support for an `allocate` statement whose `source=` specifier is a CUDA device variable.
---
Full diff: https://github.com/llvm/llvm-project/pull/171743.diff
10 Files Affected:
- (modified) flang-rt/lib/cuda/allocatable.cpp (+17-8)
- (modified) flang-rt/lib/cuda/pointer.cpp (+8-6)
- (modified) flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td (+3-2)
- (modified) flang/include/flang/Runtime/CUDA/allocatable.h (+4-2)
- (modified) flang/include/flang/Runtime/CUDA/pointer.h (+4-2)
- (modified) flang/lib/Lower/Allocatable.cpp (+10-3)
- (modified) flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp (+4-2)
- (modified) flang/test/Fir/CUDA/cuda-allocate.fir (+5-2)
- (removed) flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf (-9)
- (modified) flang/test/Lower/CUDA/cuda-allocatable.cuf (+9)
``````````diff
diff --git a/flang-rt/lib/cuda/allocatable.cpp b/flang-rt/lib/cuda/allocatable.cpp
index ff1a225d66ce9..c48b71f1dea16 100644
--- a/flang-rt/lib/cuda/allocatable.cpp
+++ b/flang-rt/lib/cuda/allocatable.cpp
@@ -57,26 +57,35 @@ int RTDEF(CUFAllocatableAllocate)(Descriptor &desc, int64_t *stream,
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
- const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+ const Descriptor *errMsg, const char *sourceFile, int sourceLine,
+ bool sourceIsDevice) {
int stat{RTNAME(CUFAllocatableAllocate)(
alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
- Fortran::runtime::DoFromSourceAssign(
- alloc, source, terminator, &MemmoveHostToDevice);
+ Fortran::runtime::DoFromSourceAssign(alloc, source, terminator,
+ sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice);
}
return stat;
}
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
- const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
- int stat{RTNAME(CUFAllocatableAllocateSync)(
- alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
+ const Descriptor *errMsg, const char *sourceFile, int sourceLine,
+ bool sourceIsDevice) {
+ int stat;
+ printf("%d\n", sourceIsDevice);
+ if (sourceIsDevice) {
+ stat = RTNAME(CUFAllocatableAllocate)(
+ alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine);
+ } else {
+ stat = RTNAME(CUFAllocatableAllocateSync)(
+ alloc, stream, pinned, hasStat, errMsg, sourceFile, sourceLine);
+ }
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
- Fortran::runtime::DoFromSourceAssign(
- alloc, source, terminator, &MemmoveHostToDevice);
+ Fortran::runtime::DoFromSourceAssign(alloc, source, terminator,
+ sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice);
}
return stat;
}
diff --git a/flang-rt/lib/cuda/pointer.cpp b/flang-rt/lib/cuda/pointer.cpp
index d3f5cfe8e96a1..f07b1a9b60924 100644
--- a/flang-rt/lib/cuda/pointer.cpp
+++ b/flang-rt/lib/cuda/pointer.cpp
@@ -56,26 +56,28 @@ int RTDEF(CUFPointerAllocateSync)(Descriptor &desc, int64_t *stream,
int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
- const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+ const Descriptor *errMsg, const char *sourceFile, int sourceLine,
+ bool sourceIsDevice) {
int stat{RTNAME(CUFPointerAllocate)(
pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
- Fortran::runtime::DoFromSourceAssign(
- pointer, source, terminator, &MemmoveHostToDevice);
+ Fortran::runtime::DoFromSourceAssign(pointer, source, terminator,
+ sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice);
}
return stat;
}
int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
const Descriptor &source, int64_t *stream, bool *pinned, bool hasStat,
- const Descriptor *errMsg, const char *sourceFile, int sourceLine) {
+ const Descriptor *errMsg, const char *sourceFile, int sourceLine,
+ bool sourceIsDevice) {
int stat{RTNAME(CUFPointerAllocateSync)(
pointer, stream, pinned, hasStat, errMsg, sourceFile, sourceLine)};
if (stat == StatOk) {
Terminator terminator{sourceFile, sourceLine};
- Fortran::runtime::DoFromSourceAssign(
- pointer, source, terminator, &MemmoveHostToDevice);
+ Fortran::runtime::DoFromSourceAssign(pointer, source, terminator,
+ sourceIsDevice ? &MemmoveDeviceToHost : &MemmoveHostToDevice);
}
return stat;
}
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 636879f28a2fb..34ac21c51b933 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -100,8 +100,9 @@ def cuf_AllocateOp : cuf_Op<"allocate", [AttrSizedOperandSegments,
Optional<fir_ReferenceType>:$stream,
Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$pinned,
Arg<Optional<AnyRefOrBoxType>, "", [MemRead]>:$source,
- cuf_DataAttributeAttr:$data_attr, UnitAttr:$hasStat,
- UnitAttr:$hasDoubleDescriptor, UnitAttr:$pointer);
+ OptionalAttr<cuf_DataAttributeAttr>:$data_attr, UnitAttr:$hasStat,
+ UnitAttr:$hasDoubleDescriptor, UnitAttr:$pointer,
+ UnitAttr:$device_source);
let results = (outs AnyIntegerType:$stat);
diff --git a/flang/include/flang/Runtime/CUDA/allocatable.h b/flang/include/flang/Runtime/CUDA/allocatable.h
index 6c97afa9e10e8..97f24bc34bfb8 100644
--- a/flang/include/flang/Runtime/CUDA/allocatable.h
+++ b/flang/include/flang/Runtime/CUDA/allocatable.h
@@ -34,14 +34,16 @@ int RTDECL(CUFAllocatableAllocateSync)(Descriptor &, int64_t *stream = nullptr,
int RTDEF(CUFAllocatableAllocateSource)(Descriptor &alloc,
const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ const char *sourceFile = nullptr, int sourceLine = 0,
+ bool sourceIsDevice = false);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFAllocatableAllocateSourceSync)(Descriptor &alloc,
const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ const char *sourceFile = nullptr, int sourceLine = 0,
+ bool sourceIsDevice = false);
/// Perform deallocation of the descriptor with synchronization of it when
/// necessary.
diff --git a/flang/include/flang/Runtime/CUDA/pointer.h b/flang/include/flang/Runtime/CUDA/pointer.h
index bdfc3268e0814..b845fd59114d4 100644
--- a/flang/include/flang/Runtime/CUDA/pointer.h
+++ b/flang/include/flang/Runtime/CUDA/pointer.h
@@ -34,14 +34,16 @@ int RTDECL(CUFPointerAllocateSync)(Descriptor &, int64_t *stream = nullptr,
int RTDEF(CUFPointerAllocateSource)(Descriptor &pointer,
const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ const char *sourceFile = nullptr, int sourceLine = 0,
+ bool sourceIsDevice = false);
/// Perform allocation of the descriptor with synchronization of it when
/// necessary. Assign data from source.
int RTDEF(CUFPointerAllocateSourceSync)(Descriptor &pointer,
const Descriptor &source, int64_t *stream = nullptr, bool *pinned = nullptr,
bool hasStat = false, const Descriptor *errMsg = nullptr,
- const char *sourceFile = nullptr, int sourceLine = 0);
+ const char *sourceFile = nullptr, int sourceLine = 0,
+ bool sourceIsDevice = false);
} // extern "C"
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index c9a9d935bd615..030439550cd15 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -629,9 +629,10 @@ class AllocateStmtHelper {
unsigned allocatorIdx = Fortran::lower::getAllocatorIdx(alloc.getSymbol());
fir::ExtendedValue exv = isSource ? sourceExv : moldExv;
+ bool sourceIsDevice = false;
if (const Fortran::semantics::Symbol *sym{GetLastSymbol(sourceExpr)})
if (Fortran::semantics::IsCUDADevice(*sym))
- TODO(loc, "CUDA Fortran: allocate with device source");
+ sourceIsDevice = true;
// Generate a sequence of runtime calls.
errorManager.genStatCheck(builder, loc);
@@ -651,7 +652,7 @@ class AllocateStmtHelper {
genSetDeferredLengthParameters(alloc, box);
genAllocateObjectBounds(alloc, box);
mlir::Value stat;
- if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) {
+ if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol()) || sourceIsDevice) {
stat =
genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol());
} else {
@@ -798,13 +799,19 @@ class AllocateStmtHelper {
// Keep return type the same as a standard AllocatableAllocate call.
mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
+ bool isSourceDevice = false;
+ if (const Fortran::semantics::Symbol *sym{GetLastSymbol(sourceExpr)})
+ if (Fortran::semantics::IsCUDADevice(*sym))
+ isSourceDevice = true;
+
bool doubleDescriptors = Fortran::lower::hasDoubleDescriptor(box.getAddr());
return cuf::AllocateOp::create(
builder, loc, retTy, box.getAddr(), errmsg, stream, pinned,
source, cudaAttr,
errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr,
doubleDescriptors ? builder.getUnitAttr() : nullptr,
- box.isPointer() ? builder.getUnitAttr() : nullptr)
+ box.isPointer() ? builder.getUnitAttr() : nullptr,
+ isSourceDevice ? builder.getUnitAttr() : nullptr)
.getResult();
}
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
index 6579c2362cd87..4444fc61239ea 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAllocationConversion.cpp
@@ -99,7 +99,6 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
mlir::Value hasStat = op.getHasStat() ? builder.createBool(loc, true)
: builder.createBool(loc, false);
-
mlir::Value errmsg;
if (op.getErrmsg()) {
errmsg = op.getErrmsg();
@@ -116,12 +115,15 @@ static mlir::LogicalResult convertOpToCall(OpTy op,
loc, fir::ReferenceType::get(
mlir::IntegerType::get(op.getContext(), 1)));
if (op.getSource()) {
+ mlir::Value isDeviceSource = op.getDeviceSource()
+ ? builder.createBool(loc, true)
+ : builder.createBool(loc, false);
mlir::Value stream =
op.getStream() ? op.getStream()
: builder.createNullConstant(loc, fTy.getInput(2));
args = fir::runtime::createArguments(
builder, loc, fTy, op.getBox(), op.getSource(), stream, pinned,
- hasStat, errmsg, sourceFile, sourceLine);
+ hasStat, errmsg, sourceFile, sourceLine, isDeviceSource);
} else {
mlir::Value stream =
op.getStream() ? op.getStream()
diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir
index 9d0d181609ada..5184561a03e67 100644
--- a/flang/test/Fir/CUDA/cuda-allocate.fir
+++ b/flang/test/Fir/CUDA/cuda-allocate.fir
@@ -128,11 +128,14 @@ func.func @_QPallocate_source() {
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%0 = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"}
+ %devsource = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a", uniq_name = "_QFallocate_sourceEa"}
%4 = fir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%5 = cuf.alloc !fir.box<!fir.heap<!fir.array<?x?xf32>>> {bindc_name = "a_d", data_attr = #cuf.cuda<device>, uniq_name = "_QFallocate_sourceEa_d"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%7 = fir.declare %5 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFallocate_sourceEa_d"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%8 = fir.load %4 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
%22 = cuf.allocate %7 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%8 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
+ %9 = fir.load %devsource : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
+ %23 = cuf.allocate %7 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>> source(%9 : !fir.box<!fir.heap<!fir.array<?x?xf32>>>) {device_source} -> i32
return
}
@@ -142,8 +145,8 @@ func.func @_QPallocate_source() {
// CHECK: %[[SOURCE:.*]] = fir.load %[[DECL_HOST]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>
// CHECK: %[[DEV_CONV:.*]] = fir.convert %[[DECL_DEV]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf32>>>>) -> !fir.ref<!fir.box<none>>
// CHECK: %[[SOURCE_CONV:.*]] = fir.convert %[[SOURCE]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.box<none>
-// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
-
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%[[DEV_CONV]], %[[SOURCE_CONV]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.ref<i64>, !fir.ref<i1>, i1, !fir.box<none>, !fir.ref<i8>, i32, i1) -> i32
+// CHECK: %{{.*}} = fir.call @_FortranACUFAllocatableAllocateSource(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %true{{.*}})
fir.global @_QMmod1Ea_d {data_attr = #cuf.cuda<device>} : !fir.box<!fir.heap<!fir.array<?x?xf32>>> {
%c0 = arith.constant 0 : index
diff --git a/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf b/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf
deleted file mode 100644
index 3e59e2f01119e..0000000000000
--- a/flang/test/Lower/CUDA/TODO/cuda-allocate-source-device.cuf
+++ /dev/null
@@ -1,9 +0,0 @@
-! RUN: %not_todo_cmd bbc -emit-fir -fcuda -o - %s 2>&1 | FileCheck %s
-
-program main
- implicit none
- integer, device, allocatable :: a_d(:)
- integer, allocatable :: a(:)
-! CHECK: not yet implemented: CUDA Fortran: allocate with device source
- allocate(a, source=a_d)
-end program
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
index 43e716532ecca..52303d126b8dc 100644
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -261,3 +261,12 @@ end subroutine
! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>, hasDoubleDescriptor} -> i32
! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<managed>, hasDoubleDescriptor} -> i32
! CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<pinned>} -> i32
+
+attributes(global) subroutine from_device_source()
+ real, device, allocatable :: a(:)
+ real, allocatable :: b(:)
+ allocate(b, source=a)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPfrom_device_source()
+! CHECK: cuf.allocate{{.*}}device_source
``````````
</details>
https://github.com/llvm/llvm-project/pull/171743
More information about the llvm-commits mailing list