[flang-commits] [flang] [llvm] [flang][DoConcurrent] Map reduction variables as tofrom ByRef for device offloading (PR #189378)

Kareem Ergawy via flang-commits flang-commits at lists.llvm.org
Mon Mar 30 06:27:38 PDT 2026


https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/189378

>From 79813463722e940e3603b32e9f51e192143d0e50 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 30 Mar 2026 06:19:51 -0500
Subject: [PATCH] [flang][DoConcurrent] Map reduction variables as tofrom ByRef
 for device offloading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Scalar reduction variables in `do concurrent reduce(...)` were being
mapped with `implicit ByCopy` when offloaded to device, because
`genMapInfoOpForLiveIn` treated all trivial types uniformly. This caused
the reduction result to be silently dropped — the device-side reduction
would compute the correct value but never write it back to the host.

Fix by detecting reduction variables and forcing `implicit tofrom ByRef`
mapping, matching the behavior of explicit
`!$omp target teams distribute parallel do reduction(...)`.

Co-authored-by: ergawy <kareem.ergawy at amd.com>
Co-authored-by: Claude <noreply at anthropic.com>
Made-with: Cursor
---
 .../OpenMP/DoConcurrentConversion.cpp         | 14 ++++--
 .../DoConcurrent/reduce_device.mlir           |  1 +
 .../DoConcurrent/reduce_device_min.f90        | 45 +++++++++++++++++
 .../do-concurrent-to-omp-min-reduce.f90       | 48 +++++++++++++++++++
 4 files changed, 104 insertions(+), 4 deletions(-)
 create mode 100644 flang/test/Transforms/DoConcurrent/reduce_device_min.f90
 create mode 100644 offload/test/offloading/fortran/do-concurrent-to-omp-min-reduce.f90

diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 876a54d29837e..5066c480141d0 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -313,8 +313,10 @@ class DoConcurrentConversion
           fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));
 
       for (mlir::Value liveIn : loopNestLiveIns) {
+        bool isReductionVar = llvm::find(loop.getReduceVars(), liveIn) !=
+                              loop.getReduceVars().end();
         targetClauseOps.mapVars.push_back(
-            genMapInfoOpForLiveIn(builder, liveIn));
+            genMapInfoOpForLiveIn(builder, liveIn, isReductionVar));
         liveInShapeInfoMap.insert(
             {liveIn, TargetDeclareShapeCreationInfo(liveIn)});
       }
@@ -540,8 +542,9 @@ class DoConcurrentConversion
         /*dataExvIsAssumedSize=*/false, rawAddr.getLoc());
   }
 
-  mlir::omp::MapInfoOp genMapInfoOpForLiveIn(fir::FirOpBuilder &builder,
-                                             mlir::Value liveIn) const {
+  mlir::omp::MapInfoOp
+  genMapInfoOpForLiveIn(fir::FirOpBuilder &builder, mlir::Value liveIn,
+                        bool isReductionVar = false) const {
     mlir::Value rawAddr = liveIn;
     llvm::StringRef name;
 
@@ -574,7 +577,10 @@ class DoConcurrentConversion
     mlir::omp::VariableCaptureKind captureKind =
         mlir::omp::VariableCaptureKind::ByRef;
 
-    if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) {
+    if (isReductionVar) {
+      mapFlag |= mlir::omp::ClauseMapFlags::to;
+      mapFlag |= mlir::omp::ClauseMapFlags::from;
+    } else if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) {
       captureKind = mlir::omp::VariableCaptureKind::ByCopy;
     } else if (!fir::isa_builtin_cptr_type(eleType)) {
       mapFlag |= mlir::omp::ClauseMapFlags::to;
diff --git a/flang/test/Transforms/DoConcurrent/reduce_device.mlir b/flang/test/Transforms/DoConcurrent/reduce_device.mlir
index 3e46692a15dca..c6456fe70dd27 100644
--- a/flang/test/Transforms/DoConcurrent/reduce_device.mlir
+++ b/flang/test/Transforms/DoConcurrent/reduce_device.mlir
@@ -36,6 +36,7 @@ func.func @_QPfoo() {
 
 // CHECK: %[[S_DECL:.*]]:2 = hlfir.declare %6 {uniq_name = "_QFfooEs"}
 // CHECK: %[[S_MAP:.*]] = omp.map.info var_ptr(%[[S_DECL]]#1
+// CHECK-SAME: map_clauses(implicit, tofrom) capture(ByRef)
 
 // CHECK: omp.target host_eval({{.*}}) map_entries({{.*}}, %[[S_MAP]] -> %[[S_TARGET_ARG:.*]] : {{.*}}) {
 // CHECK:   %[[S_DEV_DECL:.*]]:2 = hlfir.declare %[[S_TARGET_ARG]]
diff --git a/flang/test/Transforms/DoConcurrent/reduce_device_min.f90 b/flang/test/Transforms/DoConcurrent/reduce_device_min.f90
new file mode 100644
index 0000000000000..509207c1db2a8
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/reduce_device_min.f90
@@ -0,0 +1,45 @@
+! Tests that a `do concurrent reduce(min:...)` on a scalar maps the reduction
+! variable as `tofrom ByRef` (not `ByCopy`) when targeting a device. This is
+! needed so the reduced result is written back from the device to the host.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s
+
+subroutine min_reduce(arr, n, min_val)
+    implicit none
+    integer, intent(in) :: n
+    real, intent(in) :: arr(n)
+    real :: min_val
+    integer :: i
+
+    do concurrent (i=1:n) reduce(min:min_val)
+        min_val = min(min_val, arr(i))
+    end do
+end subroutine min_reduce
+
+! CHECK-DAG: omp.declare_reduction @[[RED_SYM:.*\.omp]] : f32 init
+
+! CHECK-LABEL: func.func @_QPmin_reduce
+
+! CHECK: %[[MIN_VAL_DECL:.*]]:2 = hlfir.declare %{{.*}} dummy_scope %{{.*}} {uniq_name = "_QFmin_reduceEmin_val"}
+
+! Verify the reduction variable is mapped implicit tofrom + ByRef (not implicit ByCopy).
+! CHECK: %[[MIN_VAL_MAP:.*]] = omp.map.info var_ptr(%[[MIN_VAL_DECL]]#1
+! CHECK-SAME: map_clauses(implicit, tofrom) capture(ByRef)
+! CHECK-SAME: -> !fir.ref<f32> {name = "_QFmin_reduceEmin_val"}
+
+! CHECK: omp.target
+! CHECK-SAME: map_entries({{.*}}%[[MIN_VAL_MAP]] -> %[[MIN_VAL_ARG:[[:alnum:]]+]]{{.*}})
+
+! CHECK: %[[MIN_VAL_DEV:.*]]:2 = hlfir.declare %[[MIN_VAL_ARG]] {{.*}} "_QFmin_reduceEmin_val"
+! CHECK: omp.teams reduction(@[[RED_SYM]] %[[MIN_VAL_DEV]]#0 -> %[[RED_TEAMS:.*]] : !fir.ref<f32>) {
+! CHECK:   omp.parallel {
+! CHECK:     omp.distribute {
+! CHECK:       omp.wsloop reduction(@[[RED_SYM]] %[[RED_TEAMS]] -> %[[RED_WS:.*]] : !fir.ref<f32>) {
+! CHECK:         omp.loop_nest
+! CHECK:       } {omp.composite}
+! CHECK:     } {omp.composite}
+! CHECK:   } {omp.composite}
+! CHECK: }
diff --git a/offload/test/offloading/fortran/do-concurrent-to-omp-min-reduce.f90 b/offload/test/offloading/fortran/do-concurrent-to-omp-min-reduce.f90
new file mode 100644
index 0000000000000..d54f75e4714cb
--- /dev/null
+++ b/offload/test/offloading/fortran/do-concurrent-to-omp-min-reduce.f90
@@ -0,0 +1,48 @@
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-generic -fdo-concurrent-to-openmp=device
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+
+module min_reduce_mod
+   implicit none
+   public :: min_reduce
+contains
+
+subroutine min_reduce(arr, min_val, n)
+   implicit none
+   integer, intent(in) :: n
+   real, dimension(:), intent(in) :: arr
+   real :: min_val
+   integer :: i
+
+   do concurrent(i=1:n) reduce(min:min_val)
+       min_val = min(min_val, arr(i))
+   end do
+
+   print *, 'min_val after reduction =', min_val
+end subroutine min_reduce
+
+end module min_reduce_mod
+
+program main
+   use min_reduce_mod, only: min_reduce
+   implicit none
+
+   integer, parameter :: n = 10
+   real :: arr(n)
+   real :: min_val
+
+   arr = (/ 200.0, 150.0, 80.0, 50.0, 300.0, 25.0, 175.0, 60.0, 400.0, 90.0 /)
+   min_val = 100.0
+
+   call min_reduce(arr, min_val, n)
+
+   if (min_val == 25.0) then
+       print *, 'PASS'
+   else
+       print *, 'FAIL: expected 25.0, got', min_val
+   end if
+end program main
+
+! CHECK:  PluginInterface device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK:  PASS



More information about the flang-commits mailing list