[flang-commits] [flang] [flang][acc] Generate acc.copyout for the reduction clause on compute constructs (PR #144623)

Wed Jun 18 00:25:49 PDT 2025

https://github.com/khaki3 updated https://github.com/llvm/llvm-project/pull/144623

>From a509073944186f25f4dab18b474619cc3609da49 Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Tue, 17 Jun 2025 17:39:06 -0700
Subject: [PATCH 1/2] [flang][acc] Generate acc.copyout for the reduction
 clause on compute constructs

---
 flang/lib/Lower/OpenACC.cpp                   |  8 +++++++-
 .../acc-reduction-unwrap-defaultbounds.f90    | 20 ++++++++++++++++++-
 flang/test/Lower/OpenACC/acc-reduction.f90    | 20 ++++++++++++++++++-
 3 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 69e9c53baa740..63a9d1d5616a9 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -2676,7 +2676,8 @@ static Op createComputeOp(
   llvm::SmallVector<mlir::Value> waitOperands, attachEntryOperands,
       copyEntryOperands, copyinEntryOperands, copyoutEntryOperands,
       createEntryOperands, nocreateEntryOperands, presentEntryOperands,
-      dataClauseOperands, numGangs, numWorkers, vectorLength, async;
+      reductionEntryOperands, dataClauseOperands, numGangs, numWorkers,
+      vectorLength, async;
   llvm::SmallVector<mlir::Attribute> numGangsDeviceTypes, numWorkersDeviceTypes,
       vectorLengthDeviceTypes, asyncDeviceTypes, asyncOnlyDeviceTypes,
       waitOperandsDeviceTypes, waitOnlyDeviceTypes;
@@ -2912,9 +2913,12 @@ static Op createComputeOp(
       // combined construct implies a copy clause so issue an implicit copy
       // instead.
       if (!combinedConstructs) {
+        auto crtDataStart = reductionOperands.size();
         genReductions(reductionClause->v, converter, semanticsContext, stmtCtx,
                       reductionOperands, reductionRecipes, async,
                       asyncDeviceTypes, asyncOnlyDeviceTypes);
+        reductionEntryOperands.append(reductionOperands.begin() + crtDataStart,
+                                      reductionOperands.end());
       } else {
         auto crtDataStart = dataClauseOperands.size();
         genDataOperandOperations<mlir::acc::CopyinOp>(
@@ -3038,6 +3042,8 @@ static Op createComputeOp(
       builder, nocreateEntryOperands, /*structured=*/true);
   genDataExitOperations<mlir::acc::PresentOp, mlir::acc::DeleteOp>(
       builder, presentEntryOperands, /*structured=*/true);
+  genDataExitOperations<mlir::acc::ReductionOp, mlir::acc::CopyoutOp>(
+      builder, reductionEntryOperands, /*structured=*/true);
 
   builder.restoreInsertionPoint(insPt);
   return computeOp;
diff --git a/flang/test/Lower/OpenACC/acc-reduction-unwrap-defaultbounds.f90 b/flang/test/Lower/OpenACC/acc-reduction-unwrap-defaultbounds.f90
index 5bb751678ed53..bb76122aaffac 100644
--- a/flang/test/Lower/OpenACC/acc-reduction-unwrap-defaultbounds.f90
+++ b/flang/test/Lower/OpenACC/acc-reduction-unwrap-defaultbounds.f90
@@ -1001,6 +1001,7 @@ subroutine acc_reduction_iand()
 ! CHECK-LABEL: func.func @_QPacc_reduction_iand()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<i32>)   -> !fir.ref<i32> {name = "i"}
 ! CHECK: acc.parallel   reduction(@reduction_iand_ref_i32 -> %[[RED]] : !fir.ref<i32>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<i32>) to varPtr(%{{.*}} : !fir.ref<i32>) {dataClause = #acc<data_clause acc_reduction>, name = "i"}
 
 subroutine acc_reduction_ior()
   integer :: i
@@ -1011,6 +1012,7 @@ subroutine acc_reduction_ior()
 ! CHECK-LABEL: func.func @_QPacc_reduction_ior()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<i32>)   -> !fir.ref<i32> {name = "i"}
 ! CHECK: acc.parallel reduction(@reduction_ior_ref_i32 -> %[[RED]] : !fir.ref<i32>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<i32>) to varPtr(%{{.*}} : !fir.ref<i32>) {dataClause = #acc<data_clause acc_reduction>, name = "i"}
 
 subroutine acc_reduction_ieor()
   integer :: i
@@ -1021,6 +1023,7 @@ subroutine acc_reduction_ieor()
 ! CHECK-LABEL: func.func @_QPacc_reduction_ieor()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {name = "i"}
 ! CHECK: acc.parallel reduction(@reduction_xor_ref_i32 -> %[[RED]] : !fir.ref<i32>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<i32>) to varPtr(%{{.*}} : !fir.ref<i32>) {dataClause = #acc<data_clause acc_reduction>, name = "i"}
 
 subroutine acc_reduction_and()
   logical :: l
@@ -1033,6 +1036,7 @@ subroutine acc_reduction_and()
 ! CHECK: %[[DECLL:.*]]:2 = hlfir.declare %[[L]]
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECLL]]#0 : !fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> {name = "l"}
 ! CHECK: acc.parallel reduction(@reduction_land_ref_l32 -> %[[RED]] : !fir.ref<!fir.logical<4>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.logical<4>>) to varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) {dataClause = #acc<data_clause acc_reduction>, name = "l"}
 
 subroutine acc_reduction_or()
   logical :: l
@@ -1043,6 +1047,7 @@ subroutine acc_reduction_or()
 ! CHECK-LABEL: func.func @_QPacc_reduction_or()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> {name = "l"}
 ! CHECK: acc.parallel reduction(@reduction_lor_ref_l32 -> %[[RED]] : !fir.ref<!fir.logical<4>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.logical<4>>) to varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) {dataClause = #acc<data_clause acc_reduction>, name = "l"}
 
 subroutine acc_reduction_eqv()
   logical :: l
@@ -1053,6 +1058,7 @@ subroutine acc_reduction_eqv()
 ! CHECK-LABEL: func.func @_QPacc_reduction_eqv()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> {name = "l"}
 ! CHECK: acc.parallel reduction(@reduction_eqv_ref_l32 -> %[[RED]] : !fir.ref<!fir.logical<4>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.logical<4>>) to varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) {dataClause = #acc<data_clause acc_reduction>, name = "l"}
 
 subroutine acc_reduction_neqv()
   logical :: l
@@ -1063,6 +1069,7 @@ subroutine acc_reduction_neqv()
 ! CHECK-LABEL: func.func @_QPacc_reduction_neqv()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> {name = "l"}
 ! CHECK: acc.parallel reduction(@reduction_neqv_ref_l32 -> %[[RED]] : !fir.ref<!fir.logical<4>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.logical<4>>) to varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) {dataClause = #acc<data_clause acc_reduction>, name = "l"}
 
 subroutine acc_reduction_add_cmplx()
   complex :: c
@@ -1073,6 +1080,7 @@ subroutine acc_reduction_add_cmplx()
 ! CHECK-LABEL: func.func @_QPacc_reduction_add_cmplx()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {name = "c"}
 ! CHECK: acc.parallel reduction(@reduction_add_ref_z32 -> %[[RED]] : !fir.ref<complex<f32>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<complex<f32>>) to varPtr(%{{.*}} : !fir.ref<complex<f32>>) {dataClause = #acc<data_clause acc_reduction>, name = "c"}
 
 subroutine acc_reduction_mul_cmplx()
   complex :: c
@@ -1083,6 +1091,7 @@ subroutine acc_reduction_mul_cmplx()
 ! CHECK-LABEL: func.func @_QPacc_reduction_mul_cmplx()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {name = "c"}
 ! CHECK: acc.parallel reduction(@reduction_mul_ref_z32 -> %[[RED]] : !fir.ref<complex<f32>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<complex<f32>>) to varPtr(%{{.*}} : !fir.ref<complex<f32>>) {dataClause = #acc<data_clause acc_reduction>, name = "c"}
 
 subroutine acc_reduction_add_alloc()
   integer, allocatable :: i
@@ -1098,6 +1107,7 @@ subroutine acc_reduction_add_alloc()
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[LOAD]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.heap<i32>) -> !fir.heap<i32> {name = "i"}
 ! CHECK: acc.parallel reduction(@reduction_add_heap_i32 -> %[[RED]] : !fir.heap<i32>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.heap<i32>) to varPtr(%[[BOX_ADDR]] : !fir.heap<i32>) {dataClause = #acc<data_clause acc_reduction>, name = "i"}
 
 subroutine acc_reduction_add_pointer(i)
   integer, pointer :: i
@@ -1112,6 +1122,7 @@ subroutine acc_reduction_add_pointer(i)
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[LOAD]] : (!fir.box<!fir.ptr<i32>>) -> !fir.ptr<i32>
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.ptr<i32>) -> !fir.ptr<i32> {name = "i"}
 ! CHECK: acc.parallel reduction(@reduction_add_ptr_i32 -> %[[RED]] : !fir.ptr<i32>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ptr<i32>) to varPtr(%[[BOX_ADDR]] : !fir.ptr<i32>) {dataClause = #acc<data_clause acc_reduction>, name = "i"}
 
 subroutine acc_reduction_add_static_slice(a)
   integer :: a(100)
@@ -1129,6 +1140,7 @@ subroutine acc_reduction_add_static_slice(a)
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[C100]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECLARG0]]#0 : !fir.ref<!fir.array<100xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<100xi32>> {name = "a(11:20)"}
 ! CHECK: acc.parallel reduction(@reduction_add_section_lb10.ub19_ref_100xi32 -> %[[RED]] : !fir.ref<!fir.array<100xi32>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.array<100xi32>>) bounds(%[[BOUND]]) to varPtr(%[[DECLARG0]]#0 : !fir.ref<!fir.array<100xi32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a(11:20)"}
 
 subroutine acc_reduction_add_dynamic_extent_add(a)
   integer :: a(:)
@@ -1141,6 +1153,7 @@ subroutine acc_reduction_add_dynamic_extent_add(a)
 ! CHECK: %[[DECLARG0:.*]]:2 = hlfir.declare %[[ARG0]]
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.array<?xi32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<?xi32>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_add_box_Uxi32 -> %[[RED:.*]] : !fir.ref<!fir.array<?xi32>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.array<?xi32>>) bounds(%{{.*}}) to varPtr(%{{.*}} : !fir.ref<!fir.array<?xi32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a"}
 
 subroutine acc_reduction_add_assumed_shape_max(a)
   real :: a(:)
@@ -1153,6 +1166,7 @@ subroutine acc_reduction_add_assumed_shape_max(a)
 ! CHECK: %[[DECLARG0:.*]]:2 = hlfir.declare %[[ARG0]]
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) bounds(%{{.*}}) -> !fir.ref<!fir.array<?xf32>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_max_box_Uxf32 -> %[[RED]] : !fir.ref<!fir.array<?xf32>>) {
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.array<?xf32>>) bounds(%{{.*}}) to varPtr(%{{.*}} : !fir.ref<!fir.array<?xf32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a"}
 
 subroutine acc_reduction_add_dynamic_extent_add_with_section(a)
   integer :: a(:)
@@ -1167,6 +1181,7 @@ subroutine acc_reduction_add_dynamic_extent_add_with_section(a)
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[DECL]]#0 : (!fir.box<!fir.array<?xi32>>) -> !fir.ref<!fir.array<?xi32>>
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.ref<!fir.array<?xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<?xi32>> {name = "a(2:4)"}
 ! CHECK: acc.parallel reduction(@reduction_add_section_lb1.ub3_box_Uxi32 -> %[[RED]] : !fir.ref<!fir.array<?xi32>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.array<?xi32>>) bounds(%[[BOUND]]) to varPtr(%[[BOX_ADDR]] : !fir.ref<!fir.array<?xi32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a(2:4)"}
 
 subroutine acc_reduction_add_allocatable(a)
   real, allocatable :: a(:)
@@ -1180,8 +1195,9 @@ subroutine acc_reduction_add_allocatable(a)
 ! CHECK: %[[BOX:.*]] = fir.load %[[DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%c0{{.*}} : index) upperbound(%{{.*}} : index) extent(%{{.*}}#1 : index) stride(%{{.*}}#2 : index) startIdx(%{{.*}}#0 : index) {strideInBytes = true}
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
-! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.heap<!fir.array<?xf32>>)   bounds(%{{[0-9]+}}) -> !fir.heap<!fir.array<?xf32>> {name = "a"}
+! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.heap<!fir.array<?xf32>>)   bounds(%[[BOUND]]) -> !fir.heap<!fir.array<?xf32>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_max_box_heap_Uxf32 -> %[[RED]] : !fir.heap<!fir.array<?xf32>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.heap<!fir.array<?xf32>>) bounds(%[[BOUND]]) to varPtr(%[[BOX_ADDR]] : !fir.heap<!fir.array<?xf32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a"}
 
 subroutine acc_reduction_add_pointer_array(a)
   real, pointer :: a(:)
@@ -1197,6 +1213,7 @@ subroutine acc_reduction_add_pointer_array(a)
 ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.ptr<!fir.array<?xf32>>
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[BOX_ADDR]] : !fir.ptr<!fir.array<?xf32>>) bounds(%[[BOUND]]) -> !fir.ptr<!fir.array<?xf32>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_max_box_ptr_Uxf32 -> %[[RED]] : !fir.ptr<!fir.array<?xf32>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ptr<!fir.array<?xf32>>) bounds(%[[BOUND]]) to varPtr(%[[BOX_ADDR]] : !fir.ptr<!fir.array<?xf32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a"}
 
 subroutine acc_reduction_max_dynamic_extent_max(a, n)
   integer :: n
@@ -1211,3 +1228,4 @@ subroutine acc_reduction_max_dynamic_extent_max(a, n)
 ! CHECK: %[[ADDR:.*]] = fir.box_addr %[[DECL_A]]#0 : (!fir.box<!fir.array<?x?xf32>>) -> !fir.ref<!fir.array<?x?xf32>>
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[ADDR]] : !fir.ref<!fir.array<?x?xf32>>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref<!fir.array<?x?xf32>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_max_box_UxUxf32 -> %[[RED]] : !fir.ref<!fir.array<?x?xf32>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.array<?x?xf32>>) bounds(%{{.*}}) to varPtr(%{{.*}} : !fir.ref<!fir.array<?x?xf32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a"}
diff --git a/flang/test/Lower/OpenACC/acc-reduction.f90 b/flang/test/Lower/OpenACC/acc-reduction.f90
index 20b5ad28f78a1..22a52739171b1 100644
--- a/flang/test/Lower/OpenACC/acc-reduction.f90
+++ b/flang/test/Lower/OpenACC/acc-reduction.f90
@@ -1042,6 +1042,7 @@ subroutine acc_reduction_iand()
 ! CHECK-LABEL: func.func @_QPacc_reduction_iand()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<i32>)   -> !fir.ref<i32> {name = "i"}
 ! CHECK: acc.parallel   reduction(@reduction_iand_ref_i32 -> %[[RED]] : !fir.ref<i32>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<i32>) to varPtr(%{{.*}} : !fir.ref<i32>) {dataClause = #acc<data_clause acc_reduction>, name = "i"}
 
 subroutine acc_reduction_ior()
   integer :: i
@@ -1052,6 +1053,7 @@ subroutine acc_reduction_ior()
 ! CHECK-LABEL: func.func @_QPacc_reduction_ior()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<i32>)   -> !fir.ref<i32> {name = "i"}
 ! CHECK: acc.parallel reduction(@reduction_ior_ref_i32 -> %[[RED]] : !fir.ref<i32>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<i32>) to varPtr(%{{.*}} : !fir.ref<i32>) {dataClause = #acc<data_clause acc_reduction>, name = "i"}
 
 subroutine acc_reduction_ieor()
   integer :: i
@@ -1062,6 +1064,7 @@ subroutine acc_reduction_ieor()
 ! CHECK-LABEL: func.func @_QPacc_reduction_ieor()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<i32>) -> !fir.ref<i32> {name = "i"}
 ! CHECK: acc.parallel reduction(@reduction_xor_ref_i32 -> %[[RED]] : !fir.ref<i32>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<i32>) to varPtr(%{{.*}} : !fir.ref<i32>) {dataClause = #acc<data_clause acc_reduction>, name = "i"}
 
 subroutine acc_reduction_and()
   logical :: l
@@ -1074,6 +1077,7 @@ subroutine acc_reduction_and()
 ! CHECK: %[[DECLL:.*]]:2 = hlfir.declare %[[L]]
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECLL]]#0 : !fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> {name = "l"}
 ! CHECK: acc.parallel reduction(@reduction_land_ref_l32 -> %[[RED]] : !fir.ref<!fir.logical<4>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.logical<4>>) to varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) {dataClause = #acc<data_clause acc_reduction>, name = "l"}
 
 subroutine acc_reduction_or()
   logical :: l
@@ -1084,6 +1088,7 @@ subroutine acc_reduction_or()
 ! CHECK-LABEL: func.func @_QPacc_reduction_or()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> {name = "l"}
 ! CHECK: acc.parallel reduction(@reduction_lor_ref_l32 -> %[[RED]] : !fir.ref<!fir.logical<4>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.logical<4>>) to varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) {dataClause = #acc<data_clause acc_reduction>, name = "l"}
 
 subroutine acc_reduction_eqv()
   logical :: l
@@ -1094,6 +1099,7 @@ subroutine acc_reduction_eqv()
 ! CHECK-LABEL: func.func @_QPacc_reduction_eqv()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> {name = "l"}
 ! CHECK: acc.parallel reduction(@reduction_eqv_ref_l32 -> %[[RED]] : !fir.ref<!fir.logical<4>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.logical<4>>) to varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) {dataClause = #acc<data_clause acc_reduction>, name = "l"}
 
 subroutine acc_reduction_neqv()
   logical :: l
@@ -1104,6 +1110,7 @@ subroutine acc_reduction_neqv()
 ! CHECK-LABEL: func.func @_QPacc_reduction_neqv()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) -> !fir.ref<!fir.logical<4>> {name = "l"}
 ! CHECK: acc.parallel reduction(@reduction_neqv_ref_l32 -> %[[RED]] : !fir.ref<!fir.logical<4>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.logical<4>>) to varPtr(%{{.*}} : !fir.ref<!fir.logical<4>>) {dataClause = #acc<data_clause acc_reduction>, name = "l"}
 
 subroutine acc_reduction_add_cmplx()
   complex :: c
@@ -1114,6 +1121,7 @@ subroutine acc_reduction_add_cmplx()
 ! CHECK-LABEL: func.func @_QPacc_reduction_add_cmplx()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {name = "c"}
 ! CHECK: acc.parallel reduction(@reduction_add_ref_z32 -> %[[RED]] : !fir.ref<complex<f32>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<complex<f32>>) to varPtr(%{{.*}} : !fir.ref<complex<f32>>) {dataClause = #acc<data_clause acc_reduction>, name = "c"}
 
 subroutine acc_reduction_mul_cmplx()
   complex :: c
@@ -1124,6 +1132,7 @@ subroutine acc_reduction_mul_cmplx()
 ! CHECK-LABEL: func.func @_QPacc_reduction_mul_cmplx()
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%{{.*}} : !fir.ref<complex<f32>>) -> !fir.ref<complex<f32>> {name = "c"}
 ! CHECK: acc.parallel reduction(@reduction_mul_ref_z32 -> %[[RED]] : !fir.ref<complex<f32>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<complex<f32>>) to varPtr(%{{.*}} : !fir.ref<complex<f32>>) {dataClause = #acc<data_clause acc_reduction>, name = "c"}
 
 subroutine acc_reduction_add_alloc()
   integer, allocatable :: i
@@ -1137,6 +1146,7 @@ subroutine acc_reduction_add_alloc()
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOCA]]
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECL]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<!fir.heap<i32>>> {name = "i"}
 ! CHECK: acc.parallel reduction(@reduction_add_ref_box_heap_i32 -> %[[RED]] : !fir.ref<!fir.box<!fir.heap<i32>>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.box<!fir.heap<i32>>>) to varPtr(%[[DECL]]#0 : !fir.ref<!fir.box<!fir.heap<i32>>>) {dataClause = #acc<data_clause acc_reduction>, name = "i"}
 
 subroutine acc_reduction_add_pointer(i)
   integer, pointer :: i
@@ -1149,6 +1159,7 @@ subroutine acc_reduction_add_pointer(i)
 ! CHECK: %[[DECLARG0:.*]]:2 = hlfir.declare %[[ARG0]]
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECLARG0]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>) -> !fir.ref<!fir.box<!fir.ptr<i32>>> {name = "i"}
 ! CHECK: acc.parallel reduction(@reduction_add_ref_box_ptr_i32 -> %[[RED]] : !fir.ref<!fir.box<!fir.ptr<i32>>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.box<!fir.ptr<i32>>>) to varPtr(%[[DECLARG0]]#0 : !fir.ref<!fir.box<!fir.ptr<i32>>>) {dataClause = #acc<data_clause acc_reduction>, name = "i"}
 
 subroutine acc_reduction_add_static_slice(a)
   integer :: a(100)
@@ -1166,6 +1177,7 @@ subroutine acc_reduction_add_static_slice(a)
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%[[C100]] : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index)
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECLARG0]]#0 : !fir.ref<!fir.array<100xi32>>) bounds(%[[BOUND]]) -> !fir.ref<!fir.array<100xi32>> {name = "a(11:20)"}
 ! CHECK: acc.parallel reduction(@reduction_add_section_lb10.ub19_ref_100xi32 -> %[[RED]] : !fir.ref<!fir.array<100xi32>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.array<100xi32>>) bounds(%[[BOUND]]) to varPtr(%[[DECLARG0]]#0 : !fir.ref<!fir.array<100xi32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a(11:20)"}
 
 subroutine acc_reduction_add_dynamic_extent_add(a)
   integer :: a(:)
@@ -1178,6 +1190,7 @@ subroutine acc_reduction_add_dynamic_extent_add(a)
 ! CHECK: %[[DECLARG0:.*]]:2 = hlfir.declare %[[ARG0]]
 ! CHECK: %[[RED:.*]] = acc.reduction var(%{{.*}} : !fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_add_box_Uxi32 -> %[[RED:.*]] : !fir.box<!fir.array<?xi32>>)
+! CHECK: acc.copyout accVar(%[[RED]] : !fir.box<!fir.array<?xi32>>) to var(%{{.*}} : !fir.box<!fir.array<?xi32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a"}
 
 subroutine acc_reduction_add_assumed_shape_max(a)
   real :: a(:)
@@ -1190,6 +1203,7 @@ subroutine acc_reduction_add_assumed_shape_max(a)
 ! CHECK: %[[DECLARG0:.*]]:2 = hlfir.declare %[[ARG0]]
 ! CHECK: %[[RED:.*]] = acc.reduction var(%{{.*}} : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_max_box_Uxf32 -> %[[RED]] : !fir.box<!fir.array<?xf32>>) {
+! CHECK: acc.copyout accVar(%[[RED]] : !fir.box<!fir.array<?xf32>>) to var(%{{.*}} : !fir.box<!fir.array<?xf32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a"}
 
 subroutine acc_reduction_add_dynamic_extent_add_with_section(a)
   integer :: a(:)
@@ -1202,7 +1216,8 @@ subroutine acc_reduction_add_dynamic_extent_add_with_section(a)
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_reduction_add_dynamic_extent_add_with_sectionEa"} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
 ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%c1{{.*}} : index) upperbound(%c3{{.*}} : index) extent(%{{.*}}#1 : index) stride(%{{.*}}#2 : index) startIdx(%{{.*}} : index) {strideInBytes = true}
 ! CHECK: %[[RED:.*]] = acc.reduction var(%[[DECL]]#0 : !fir.box<!fir.array<?xi32>>) bounds(%[[BOUND]]) -> !fir.box<!fir.array<?xi32>> {name = "a(2:4)"}
-! CHECK: acc.parallel reduction(@reduction_add_section_lb1.ub3_box_Uxi32 -> %[[RED]] : !fir.box<!fir.array<?xi32>>)
+! CHECK: acc.parallel reduction(@reduction_add_section_lb1.ub3_box_Uxi32 -> %[[RED]] : !fir.box<!fir.array<?xi32>>
+! CHECK: acc.copyout accVar(%[[RED]] : !fir.box<!fir.array<?xi32>>) bounds(%[[BOUND]]) to var(%[[DECL]]#0 : !fir.box<!fir.array<?xi32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a(2:4)"}
 
 subroutine acc_reduction_add_allocatable(a)
   real, allocatable :: a(:)
@@ -1215,6 +1230,7 @@ subroutine acc_reduction_add_allocatable(a)
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFacc_reduction_add_allocatableEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_max_ref_box_heap_Uxf32 -> %[[RED]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) to varPtr(%[[DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) {dataClause = #acc<data_clause acc_reduction>, name = "a"}
 
 subroutine acc_reduction_add_pointer_array(a)
   real, pointer :: a(:)
@@ -1227,6 +1243,7 @@ subroutine acc_reduction_add_pointer_array(a)
 ! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFacc_reduction_add_pointer_arrayEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
 ! CHECK: %[[RED:.*]] = acc.reduction varPtr(%[[DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_max_ref_box_ptr_Uxf32 -> %[[RED]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+! CHECK: acc.copyout accPtr(%[[RED]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) to varPtr(%[[DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) {dataClause = #acc<data_clause acc_reduction>, name = "a"}
 
 subroutine acc_reduction_max_dynamic_extent_max(a, n)
   integer :: n
@@ -1240,3 +1257,4 @@ subroutine acc_reduction_max_dynamic_extent_max(a, n)
 ! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%{{.*}}) dummy_scope %{{[0-9]+}} {uniq_name = "_QFacc_reduction_max_dynamic_extent_maxEa"} : (!fir.ref<!fir.array<?x?xf32>>, !fir.shape<2>, !fir.dscope) -> (!fir.box<!fir.array<?x?xf32>>, !fir.ref<!fir.array<?x?xf32>>)
 ! CHECK: %[[RED:.*]] = acc.reduction var(%[[DECL_A]]#0 : !fir.box<!fir.array<?x?xf32>>) -> !fir.box<!fir.array<?x?xf32>> {name = "a"}
 ! CHECK: acc.parallel reduction(@reduction_max_box_UxUxf32 -> %[[RED]] : !fir.box<!fir.array<?x?xf32>>)
+! CHECK: acc.copyout accVar(%[[RED]] : !fir.box<!fir.array<?x?xf32>>) to var(%[[DECL_A]]#0 : !fir.box<!fir.array<?x?xf32>>) {dataClause = #acc<data_clause acc_reduction>, name = "a"}

>From acbc6cdc22d4742cd3748dd0137168ba3124c0cc Mon Sep 17 00:00:00 2001
From: Kazuaki Matsumura <kmatsumura at nvidia.com>
Date: Wed, 18 Jun 2025 00:25:00 -0700
Subject: [PATCH 2/2] [flang][acc] Reorder the generation of data exit

---
 flang/lib/Lower/OpenACC.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 63a9d1d5616a9..3ad726b068bb4 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -3028,6 +3028,8 @@ static Op createComputeOp(
   builder.setInsertionPointAfter(computeOp);
 
   // Create the exit operations after the region.
+  genDataExitOperations<mlir::acc::ReductionOp, mlir::acc::CopyoutOp>(
+      builder, reductionEntryOperands, /*structured=*/true);
   genDataExitOperations<mlir::acc::CopyinOp, mlir::acc::CopyoutOp>(
       builder, copyEntryOperands, /*structured=*/true);
   genDataExitOperations<mlir::acc::CopyinOp, mlir::acc::DeleteOp>(
@@ -3042,8 +3044,6 @@ static Op createComputeOp(
       builder, nocreateEntryOperands, /*structured=*/true);
   genDataExitOperations<mlir::acc::PresentOp, mlir::acc::DeleteOp>(
       builder, presentEntryOperands, /*structured=*/true);
-  genDataExitOperations<mlir::acc::ReductionOp, mlir::acc::CopyoutOp>(
-      builder, reductionEntryOperands, /*structured=*/true);
 
   builder.restoreInsertionPoint(insPt);
   return computeOp;