[flang] [llvm] [Flang][OpenMP] Appropriately emit present/load/store in all cases in MapInfoFinalization (PR #150311)

via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 25 02:15:25 PDT 2025


https://github.com/agozillon updated https://github.com/llvm/llvm-project/pull/150311

>From 3b4c5ac14402a2a9006e646980a4bca58a54666c Mon Sep 17 00:00:00 2001
From: agozillon <Andrew.Gozillon at amd.com>
Date: Wed, 23 Jul 2025 10:16:08 -0500
Subject: [PATCH 1/2] [Flang][OpenMP] Appropriately emit present/load/store in
 all cases in MapInfoFinalization

Currently, we return early whenever we've already generated an allocation for intermediate
descriptor variables (required in certain cases when we can't directly access the base
address of a passes in descriptor function argument due to HLFIR/FIR restrictions). This
unfortunately, skips over the presence check and load/store required to set the
intermediate descriptor allocations values/data. This is fine in most cases, but if a
function happens to have a series of branches with seperate target regions capturing
the same input argument, we'd emit the present/load/store into the first branch with
the first target inside of it, the secondary (or any preceding) branches would not
have the present/load/store, this would lead to the subsequent mapped values in that
branch being empty and then leading to a memory access violation on device.

The fix for the moment is to emit a present/load/store at the relevant location
of every target utilising the input argument, this likely will also lead to fixing
possible issues with the input argument being manipulated inbetween target regions
(primarily resizing, the data should remain the same as we're just copying an address
around, in theory at least). There's possible optimizations/simplifications to emit
less load/stores such as by raising the load/store out of the branches when we can,
but I'm inclined to leave this sort of optimization to lower level passes such as
an LLVM pass (which very possibly already covers it).
---
 .../Optimizer/OpenMP/MapInfoFinalization.cpp  | 53 +++++++++++--------
 .../Lower/OpenMP/optional-argument-map-3.f90  | 46 ++++++++++++++++
 .../fortran/optional-mapped-arguments-3.f90   | 45 ++++++++++++++++
 3 files changed, 122 insertions(+), 22 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/optional-argument-map-3.f90
 create mode 100644 offload/test/offloading/fortran/optional-mapped-arguments-3.f90

diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
index 03ca3d800f6b9..57be863cfa1b8 100644
--- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
+++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp
@@ -137,32 +137,41 @@ class MapInfoFinalizationPass
         !fir::factory::isOptionalArgument(descriptor.getDefiningOp()))
       return descriptor;
 
-    mlir::Value &slot = localBoxAllocas[descriptor.getDefiningOp()];
-    if (slot) {
-      return slot;
+    mlir::Value &alloca = localBoxAllocas[descriptor.getDefiningOp()];
+    mlir::Location loc = boxMap->getLoc();
+
+    if (!alloca) {
+      // The fir::BoxOffsetOp only works with !fir.ref<!fir.box<...>> types, as
+      // allowing it to access non-reference box operations can cause some
+      // problematic SSA IR. However, in the case of assumed shape's the type
+      // is not a !fir.ref, in these cases to retrieve the appropriate
+      // !fir.ref<!fir.box<...>> to access the data we need to map we must
+      // perform an alloca and then store to it and retrieve the data from the
+      // new alloca.
+      mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint();
+      mlir::Block *allocaBlock = builder.getAllocaBlock();
+      assert(allocaBlock && "No alloca block found for this top level op");
+      builder.setInsertionPointToStart(allocaBlock);
+
+      mlir::Type allocaType = descriptor.getType();
+      if (fir::isBoxAddress(allocaType))
+        allocaType = fir::unwrapRefType(allocaType);
+      alloca = fir::AllocaOp::create(builder, loc, allocaType);
+      builder.restoreInsertionPoint(insPt);
     }
 
-    // The fir::BoxOffsetOp only works with !fir.ref<!fir.box<...>> types, as
-    // allowing it to access non-reference box operations can cause some
-    // problematic SSA IR. However, in the case of assumed shape's the type
-    // is not a !fir.ref, in these cases to retrieve the appropriate
-    // !fir.ref<!fir.box<...>> to access the data we need to map we must
-    // perform an alloca and then store to it and retrieve the data from the new
-    // alloca.
-    mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint();
-    mlir::Block *allocaBlock = builder.getAllocaBlock();
-    mlir::Location loc = boxMap->getLoc();
-    assert(allocaBlock && "No alloca block found for this top level op");
-    builder.setInsertionPointToStart(allocaBlock);
-
-    mlir::Type allocaType = descriptor.getType();
-    if (fir::isBoxAddress(allocaType))
-      allocaType = fir::unwrapRefType(allocaType);
-    auto alloca = fir::AllocaOp::create(builder, loc, allocaType);
-    builder.restoreInsertionPoint(insPt);
     // We should only emit a store if the passed in data is present, it is
     // possible a user passes in no argument to an optional parameter, in which
     // case we cannot store or we'll segfault on the emitted memcpy.
+    // TODO: We currently emit a present -> load/store every time we use a
+    // mapped value that requires a local allocation, this isn't the most
+    // efficient, although, it is more correct in a lot of situations. One
+    // such situation is emitting a this series of instructions in separate
+    // segments of a branch (e.g. two target regions in separate else/if branch
+    // mapping the same function argument), however, it would be nice to be able
+    // to optimize these situations e.g. raising the load/store out of the
+    // branch if possible. But perhaps this is best left to lower level
+    // optimisation passes.
     auto isPresent =
         fir::IsPresentOp::create(builder, loc, builder.getI1Type(), descriptor);
     builder.genIfOp(loc, {}, isPresent, false)
@@ -171,7 +180,7 @@ class MapInfoFinalizationPass
           fir::StoreOp::create(builder, loc, descriptor, alloca);
         })
         .end();
-    return slot = alloca;
+    return alloca;
   }
 
   /// Function that generates a FIR operation accessing the descriptor's
diff --git a/flang/test/Lower/OpenMP/optional-argument-map-3.f90 b/flang/test/Lower/OpenMP/optional-argument-map-3.f90
new file mode 100644
index 0000000000000..f1c1ac2d0bad0
--- /dev/null
+++ b/flang/test/Lower/OpenMP/optional-argument-map-3.f90
@@ -0,0 +1,46 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+module mod
+contains
+      subroutine foo( dt, switch )
+        implicit none
+        real(4), dimension(:), INTENT(INOUT) :: dt
+        logical, INTENT(IN) :: switch
+        integer :: dim, idx
+
+        if ( switch ) then
+!$omp target teams distribute parallel do
+            do idx = 1, 100
+            dt(idx) = 20
+            end do
+        else
+!$omp target teams distribute parallel do
+            do idx = 1, 100
+            dt(idx) = 30
+            end do
+        end if
+      end subroutine foo
+end module
+
+! CHECK-LABEL:   func.func @{{.*}}(
+! CHECK-SAME:      %[[ARG0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "dt"},
+! CHECK:           %[[VAL_0:.*]] = fir.alloca !fir.box<!fir.array<?xf32>>
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope {{.*}}
+! CHECK:           fir.if %{{.*}} {
+! CHECK:             %[[VAL_2:.*]] = fir.is_present %[[VAL_1]]#1 : (!fir.box<!fir.array<?xf32>>) -> i1
+! CHECK:             fir.if %[[VAL_2]] {
+! CHECK:               fir.store %[[VAL_1]]#1 to %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+! CHECK:             }
+! CHECK:             %[[VAL_3:.*]] = fir.box_offset %[[VAL_0]] base_addr : (!fir.ref<!fir.box<!fir.array<?xf32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>
+! CHECK:             %[[VAL_4:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, f32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%[[VAL_3]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>> {name = ""}
+! CHECK:             %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.box<!fir.array<?xf32>>) map_clauses(implicit, to) capture(ByRef) members(%[[VAL_4]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>) -> !fir.ref<!fir.array<?xf32>> {name = "dt"}
+! CHECK:             omp.target host_eval({{.*}}) map_entries({{.*}}%[[VAL_5]] -> {{.*}}, %[[VAL_4]] -> {{.*}} : {{.*}}) {
+! CHECK:           } else {
+! CHECK:             %[[VAL_6:.*]] = fir.is_present %[[VAL_1]]#1 : (!fir.box<!fir.array<?xf32>>) -> i1
+! CHECK:             fir.if %[[VAL_6]] {
+! CHECK:               fir.store %[[VAL_1]]#1 to %[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
+! CHECK:             }
+! CHECK:             %[[VAL_7:.*]] = fir.box_offset %[[VAL_0]] base_addr : (!fir.ref<!fir.box<!fir.array<?xf32>>>) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>
+! CHECK:             %[[VAL_8:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, f32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%[[VAL_7]] : !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>) bounds(%{{.*}}) -> !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>> {name = ""}
+! CHECK:             %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !fir.ref<!fir.box<!fir.array<?xf32>>>, !fir.box<!fir.array<?xf32>>) map_clauses(implicit, to) capture(ByRef) members(%[[VAL_8]] : [0] : !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>) -> !fir.ref<!fir.array<?xf32>> {name = "dt"}
+! CHECK:             omp.target host_eval({{.*}}) map_entries({{.*}}, %[[VAL_9]] ->{{.*}}, %[[VAL_8]] -> {{.*}} : {{.*}}) {
diff --git a/offload/test/offloading/fortran/optional-mapped-arguments-3.f90 b/offload/test/offloading/fortran/optional-mapped-arguments-3.f90
new file mode 100644
index 0000000000000..1059342d3112e
--- /dev/null
+++ b/offload/test/offloading/fortran/optional-mapped-arguments-3.f90
@@ -0,0 +1,45 @@
+! OpenMP offloading test that checks we do not cause a segfault when mapping
+! optional function arguments (present or otherwise). No results requiring
+! checking other than that the program compiles and runs to completion with no
+! error. This particular variation checks that we're correctly emitting the
+! load/store in both branches mapping the input array.
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+module reproducer_mod
+contains
+      subroutine branching_target_call( dt, switch )
+      implicit none
+      real(4), dimension(:), INTENT(INOUT) :: dt
+      logical, INTENT(IN) :: switch
+      integer :: dim, idx
+
+      dim = size(dt)
+      if ( switch ) then
+!$omp target teams distribute parallel do
+        do idx = 1, dim
+          dt(idx) = 20
+        end do
+      else
+!$omp target teams distribute parallel do
+        do idx = 1, dim
+          dt(idx) = 30
+        end do
+      end if
+      end subroutine branching_target_call
+end module reproducer_mod
+
+program reproducer
+  use reproducer_mod
+  implicit none
+  real(4), dimension(:), allocatable :: dt
+  integer :: n = 21312
+  integer :: i
+
+  allocate(dt(n))
+  call branching_target_call(dt, .FALSE.)
+  call branching_target_call(dt, .TRUE.)
+  print *, "PASSED"
+end program reproducer
+
+! CHECK: PASSED

>From e3538b8e13188792b6f447ad3f2552d9cd3b5663 Mon Sep 17 00:00:00 2001
From: agozillon <Andrew.Gozillon at amd.com>
Date: Thu, 24 Jul 2025 17:11:30 -0500
Subject: [PATCH 2/2] Tidy up tests

---
 .../Lower/OpenMP/optional-argument-map-3.f90  | 26 ++++++------
 .../fortran/optional-mapped-arguments-3.f90   | 40 +++++++++----------
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/flang/test/Lower/OpenMP/optional-argument-map-3.f90 b/flang/test/Lower/OpenMP/optional-argument-map-3.f90
index f1c1ac2d0bad0..7e2a24e31123e 100644
--- a/flang/test/Lower/OpenMP/optional-argument-map-3.f90
+++ b/flang/test/Lower/OpenMP/optional-argument-map-3.f90
@@ -2,24 +2,24 @@
 
 module mod
 contains
-      subroutine foo( dt, switch )
-        implicit none
-        real(4), dimension(:), INTENT(INOUT) :: dt
-        logical, INTENT(IN) :: switch
-        integer :: dim, idx
+   subroutine foo(dt, switch)
+      implicit none
+      real(4), dimension(:), intent(inout) :: dt
+      logical, intent(in) :: switch
+      integer :: dim, idx
 
-        if ( switch ) then
+      if (switch) then
 !$omp target teams distribute parallel do
-            do idx = 1, 100
+         do idx = 1, 100
             dt(idx) = 20
-            end do
-        else
+         end do
+      else
 !$omp target teams distribute parallel do
-            do idx = 1, 100
+         do idx = 1, 100
             dt(idx) = 30
-            end do
-        end if
-      end subroutine foo
+         end do
+      end if
+   end subroutine foo
 end module
 
 ! CHECK-LABEL:   func.func @{{.*}}(
diff --git a/offload/test/offloading/fortran/optional-mapped-arguments-3.f90 b/offload/test/offloading/fortran/optional-mapped-arguments-3.f90
index 1059342d3112e..cf82a854b5474 100644
--- a/offload/test/offloading/fortran/optional-mapped-arguments-3.f90
+++ b/offload/test/offloading/fortran/optional-mapped-arguments-3.f90
@@ -8,38 +8,38 @@
 ! RUN: %libomptarget-compile-fortran-run-and-check-generic
 module reproducer_mod
 contains
-      subroutine branching_target_call( dt, switch )
+   subroutine branching_target_call(dt, switch)
       implicit none
-      real(4), dimension(:), INTENT(INOUT) :: dt
-      logical, INTENT(IN) :: switch
+      real(4), dimension(:), intent(inout) :: dt
+      logical, intent(in) :: switch
       integer :: dim, idx
 
       dim = size(dt)
-      if ( switch ) then
+      if (switch) then
 !$omp target teams distribute parallel do
-        do idx = 1, dim
-          dt(idx) = 20
-        end do
+         do idx = 1, dim
+            dt(idx) = 20
+         end do
       else
 !$omp target teams distribute parallel do
-        do idx = 1, dim
-          dt(idx) = 30
-        end do
+         do idx = 1, dim
+            dt(idx) = 30
+         end do
       end if
-      end subroutine branching_target_call
+   end subroutine branching_target_call
 end module reproducer_mod
 
 program reproducer
-  use reproducer_mod
-  implicit none
-  real(4), dimension(:), allocatable :: dt
-  integer :: n = 21312
-  integer :: i
+   use reproducer_mod
+   implicit none
+   real(4), dimension(:), allocatable :: dt
+   integer :: n = 21312
+   integer :: i
 
-  allocate(dt(n))
-  call branching_target_call(dt, .FALSE.)
-  call branching_target_call(dt, .TRUE.)
-  print *, "PASSED"
+   allocate (dt(n))
+   call branching_target_call(dt, .FALSE.)
+   call branching_target_call(dt, .TRUE.)
+   print *, "PASSED"
 end program reproducer
 
 ! CHECK: PASSED



More information about the llvm-commits mailing list