[llvm-branch-commits] [flang] [flang][OpenMP] `do concurrent` to device mapping lit tests (PR #155992)

Mon Sep 1 22:25:59 PDT 2025

https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/155992

>From 77181e62b5b28424f0bbaad96cbc9820c9fadc53 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Fri, 29 Aug 2025 03:53:51 -0500
Subject: [PATCH] [flang][OpenMP] `do concurrent` to device mapping lit tests

Adds more lit tests for `do concurrent` device mapping.
---
 .../Transforms/DoConcurrent/allocatable.f90   |  29 +++++
 .../Transforms/DoConcurrent/host_eval.f90     |  63 +++++++++++
 .../DoConcurrent/locally_destroyed_temp.f90   |  43 ++++---
 .../DoConcurrent/map_shape_info.f90           | 104 +++++++++++++++++
 .../multiple_iteration_ranges.f90             | 106 +++++++++++-------
 .../DoConcurrent/non_reference_to_device.f90  |  34 ++++++
 .../DoConcurrent/not_perfectly_nested.f90     |  66 +++++++----
 .../DoConcurrent/runtime_sized_array.f90      |  42 +++++++
 .../DoConcurrent/skip_all_nested_loops.f90    |  68 +++++++++++
 9 files changed, 478 insertions(+), 77 deletions(-)
 create mode 100644 flang/test/Transforms/DoConcurrent/allocatable.f90
 create mode 100644 flang/test/Transforms/DoConcurrent/host_eval.f90
 create mode 100644 flang/test/Transforms/DoConcurrent/map_shape_info.f90
 create mode 100644 flang/test/Transforms/DoConcurrent/non_reference_to_device.f90
 create mode 100644 flang/test/Transforms/DoConcurrent/runtime_sized_array.f90
 create mode 100644 flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90

diff --git a/flang/test/Transforms/DoConcurrent/allocatable.f90 b/flang/test/Transforms/DoConcurrent/allocatable.f90
new file mode 100644
index 0000000000000..03962f150eb95
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/allocatable.f90
@@ -0,0 +1,29 @@
+! Verifies that proper `omp.map.bounds` ops are emitted when an allocatable is
+! implicitly mapped by a `do concurrent` loop.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s
+program main
+   implicit none
+
+   integer,parameter :: n = 1000000
+   real, allocatable, dimension(:) :: y
+   integer :: i
+
+   allocate(y(1:n))
+
+   do concurrent(i=1:n)
+       y(i) = 42
+   end do
+
+   deallocate(y)
+end program main
+
+! CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFEy"}
+! CHECK: %[[Y_VAL:.*]] = fir.load %[[Y_DECL]]#0
+! CHECK: %[[Y_DIM0:.*]]:3 = fir.box_dims %[[Y_VAL]], %{{c0_.*}}
+! CHECK: %[[Y_LB:.*]] = arith.constant 0 : index
+! CHECK: %[[Y_UB:.*]] = arith.subi %[[Y_DIM0]]#1, %{{c1_.*}} : index
+! CHECK: %[[Y_BOUNDS:.*]] = omp.map.bounds lower_bound(%[[Y_LB]] : index) upper_bound(%[[Y_UB]] : index) extent(%[[Y_DIM0]]#1 : index)
+! CHECK: %[[MEM_MAP:.*]] = omp.map.info {{.*}} bounds(%[[Y_BOUNDS]])
+! CHECK: omp.map.info var_ptr(%[[Y_DECL]]#1 : {{.*}}) {{.*}} members(%[[MEM_MAP]] : {{.*}})
diff --git a/flang/test/Transforms/DoConcurrent/host_eval.f90 b/flang/test/Transforms/DoConcurrent/host_eval.f90
new file mode 100644
index 0000000000000..7d16a91ae6941
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/host_eval.f90
@@ -0,0 +1,63 @@
+! Tests `host_eval` clause code-gen and loop nest bounds on host vs. device.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa   \
+! RUN:   -fdo-concurrent-to-openmp=device %s -o -                           \
+! RUN: | FileCheck %s --check-prefix=HOST -vv
+
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir -fopenmp            \
+! RUN:   -fopenmp-is-target-device -fdo-concurrent-to-openmp=device %s -o - \
+! RUN: | FileCheck %s --check-prefix=DEVICE
+
+program do_concurrent_host_eval
+    implicit none
+    integer :: i, j
+
+    do concurrent (i=1:10, j=1:20)
+    end do
+end program do_concurrent_host_eval
+
+! HOST: omp.target host_eval(
+! HOST-SAME:    %{{[^[:space:]]+}} -> %[[I_LB:[^,]+]],
+! HOST-SAME:    %{{[^[:space:]]+}} -> %[[I_UB:[^,]+]],
+! HOST-SAME:    %{{[^[:space:]]+}} -> %[[I_ST:[^,]+]],
+! HOST-SAME:    %{{[^[:space:]]+}} -> %[[J_LB:[^,]+]],
+! HOST-SAME:    %{{[^[:space:]]+}} -> %[[J_UB:[^,]+]],
+! HOST-SAME:    %{{[^[:space:]]+}} -> %[[J_ST:[^,]+]] : {{.*}}) map_entries
+
+! HOST: omp.loop_nest ({{.*}}, {{.*}}) : index = (%[[I_LB]], %[[J_LB]]) to
+! HOST-SAME:    (%[[I_UB]], %[[J_UB]]) inclusive step
+! HOST-SAME:    (%[[I_ST]], %[[J_ST]])
+
+! DEVICE: omp.target map_entries(
+! DEVICE-SAME:  %{{[^[:space:]]+}} -> %[[I_LB_MAP:[^,]+]],
+! DEVICE-SAME:  %{{[^[:space:]]+}} -> %[[I_UB_MAP:[^,]+]],
+! DEVICE-SAME:  %{{[^[:space:]]+}} -> %[[I_ST_MAP:[^,]+]],
+
+! DEVICE-SAME:  %{{[^[:space:]]+}} -> %[[J_LB_MAP:[^,]+]],
+! DEVICE-SAME:  %{{[^[:space:]]+}} -> %[[J_UB_MAP:[^,]+]],
+! DEVICE-SAME:  %{{[^[:space:]]+}} -> %[[J_ST_MAP:[^,]+]],
+
+! DEVICE-SAME:  %{{[^[:space:]]+}} -> %{{[^,]+}},
+! DEVICE-SAME:  %{{[^[:space:]]+}} -> %{{[^,]+}} : {{.*}})
+
+! DEVICE: %[[I_LB_DECL:.*]]:2 = hlfir.declare %[[I_LB_MAP]]
+! DEVICE: %[[I_LB:.*]] = fir.load %[[I_LB_DECL]]#1 : !fir.ref<index>
+
+! DEVICE: %[[I_UB_DECL:.*]]:2 = hlfir.declare %[[I_UB_MAP]]
+! DEVICE: %[[I_UB:.*]] = fir.load %[[I_UB_DECL]]#1 : !fir.ref<index>
+
+! DEVICE: %[[I_ST_DECL:.*]]:2 = hlfir.declare %[[I_ST_MAP]]
+! DEVICE: %[[I_ST:.*]] = fir.load %[[I_ST_DECL]]#1 : !fir.ref<index>
+
+! DEVICE: %[[J_LB_DECL:.*]]:2 = hlfir.declare %[[J_LB_MAP]]
+! DEVICE: %[[J_LB:.*]] = fir.load %[[J_LB_DECL]]#1 : !fir.ref<index>
+
+! DEVICE: %[[J_UB_DECL:.*]]:2 = hlfir.declare %[[J_UB_MAP]]
+! DEVICE: %[[J_UB:.*]] = fir.load %[[J_UB_DECL]]#1 : !fir.ref<index>
+
+! DEVICE: %[[J_ST_DECL:.*]]:2 = hlfir.declare %[[J_ST_MAP]]
+! DEVICE: %[[J_ST:.*]] = fir.load %[[J_ST_DECL]]#1 : !fir.ref<index>
+
+! DEVICE: omp.loop_nest ({{.*}}, {{.*}}) : index = (%[[I_LB]], %[[J_LB]]) to
+! DEVICE-SAME:    (%[[I_UB]], %[[J_UB]]) inclusive step
+! DEVICE-SAME:    (%[[I_ST]], %[[J_ST]])
diff --git a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90 b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
index f82696669eca6..28429cebf8587 100644
--- a/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
+++ b/flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90
@@ -1,9 +1,14 @@
+! Fails until we update the pass to use the `fir.do_concurrent` op.
+
 ! Tests that "loop-local values" are properly handled by localizing them to the
 ! body of the loop nest. See `collectLoopLocalValues` and `localizeLoopLocalValue`
 ! for a definition of "loop-local values" and how they are handled.
 
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
-! RUN:   | FileCheck %s
+! RUN:   | FileCheck %s --check-prefixes=COMMON
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s --check-prefixes=COMMON,DEVICE
 module struct_mod
     type test_struct
         integer, allocatable :: x_
@@ -46,17 +51,25 @@ program main
     print *, "total =", total
 end program main
 
-! CHECK: omp.parallel {
-! CHECK:   %[[LOCAL_TEMP:.*]] = fir.alloca !fir.type<_QMstruct_modTtest_struct{x_:!fir.box<!fir.heap<i32>>}> {bindc_name = ".result"}
-! CHECK:   omp.wsloop {
-! CHECK:     omp.loop_nest {{.*}} {
-! CHECK:       %[[TEMP_VAL:.*]] = fir.call @_QMstruct_modPconstruct_from_components
-! CHECK:       fir.save_result %[[TEMP_VAL]] to %[[LOCAL_TEMP]]
-! CHECK:       %[[EMBOXED_LOCAL:.*]] = fir.embox %[[LOCAL_TEMP]]
-! CHECK:       %[[CONVERTED_LOCAL:.*]] = fir.convert %[[EMBOXED_LOCAL]]
-! CHECK:       fir.call @_FortranADestroy(%[[CONVERTED_LOCAL]])
-! CHECK:       omp.yield
-! CHECK:     }
-! CHECK:   }
-! CHECK:   omp.terminator
-! CHECK: }
+! DEVICE: omp.target {{.*}} {
+! DEVICE: omp.teams {
+! COMMON: omp.parallel {
+! COMMON:   %[[LOCAL_TEMP:.*]] = fir.alloca !fir.type<_QMstruct_modTtest_struct{x_:!fir.box<!fir.heap<i32>>}> {bindc_name = ".result"}
+! DEVICE:   omp.distribute {
+! COMMON:   omp.wsloop {
+! COMMON:     omp.loop_nest {{.*}} {
+! COMMON:       %[[TEMP_VAL:.*]] = fir.call @_QMstruct_modPconstruct_from_components
+! COMMON:       fir.save_result %[[TEMP_VAL]] to %[[LOCAL_TEMP]]
+! COMMON:       %[[EMBOXED_LOCAL:.*]] = fir.embox %[[LOCAL_TEMP]]
+! COMMON:       %[[CONVERTED_LOCAL:.*]] = fir.convert %[[EMBOXED_LOCAL]]
+! COMMON:       fir.call @_FortranADestroy(%[[CONVERTED_LOCAL]])
+! COMMON:       omp.yield
+! COMMON:     }
+! COMMON:   }
+! DEVICE:   }
+! COMMON:   omp.terminator
+! COMMON: }
+! DEVICE: omp.terminator
+! DEVICE: }
+! DEVICE: omp.terminator
+! DEVICE: }
diff --git a/flang/test/Transforms/DoConcurrent/map_shape_info.f90 b/flang/test/Transforms/DoConcurrent/map_shape_info.f90
new file mode 100644
index 0000000000000..3dca1340ae6b9
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/map_shape_info.f90
@@ -0,0 +1,104 @@
+! Tests mapping of a basic `do concurrent` loop to
+! `!$omp target teams distribute parallel do`.
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s
+
+program do_concurrent_shape
+    implicit none
+    integer :: a(10, 20)
+    integer :: i, j
+
+    do concurrent (i=1:10, j=1:20)
+        a(i, j) = i * j
+    end do
+end program do_concurrent_shape
+
+! CHECK: fir.store %{{c10.*}} to %[[DIM0_EXT:.*]] : !fir.ref<index>
+! CHECK: fir.store %{{c20.*}} to %[[DIM1_EXT:.*]] : !fir.ref<index>
+
+! CHECK: omp.map.info
+! CHECK: omp.map.info
+! CHECK: omp.map.info
+
+! CHECK: omp.map.info
+! CHECK: omp.map.info
+! CHECK: omp.map.info
+
+! CHECK: omp.map.info
+! CHECK: omp.map.info
+! CHECK: omp.map.info
+
+! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info 
+! CHECK-SAME:   var_ptr(%[[DIM0_EXT]] : !fir.ref<index>, index)
+! CHECK-SAME:   map_clauses(implicit, exit_release_or_enter_alloc) 
+! CHECK-SAME:   capture(ByCopy) -> !fir.ref<index> {name = "_QFEa.extent.dim0"}
+
+! CHECK: %[[DIM1_EXT_MAP:.*]] = omp.map.info
+! CHECK-SAME:   var_ptr(%[[DIM1_EXT]] : !fir.ref<index>, index)
+! CHECK-SAME:   map_clauses(implicit, exit_release_or_enter_alloc)
+! CHECK-SAME:   capture(ByCopy) -> !fir.ref<index> {name = "_QFEa.extent.dim1"}
+
+! CHECK: omp.target host_eval({{.*}}) map_entries(
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %[[DIM0_EXT_MAP]] -> %[[DIM0_EXT_ARG:[^,]+]],
+! CHECK-SAME:   %[[DIM1_EXT_MAP]] -> %[[DIM1_EXT_ARG:[^,]+]] : {{.*}})
+
+! CHECK-DAG:    %[[DIM0_EXT_DEV:.*]] = fir.load %[[DIM0_EXT_ARG]]
+! CHECK-DAG:    %[[DIM1_EXT_DEV:.*]] = fir.load %[[DIM1_EXT_ARG]]
+
+! CHECK:        %[[SHAPE:.*]] = fir.shape %[[DIM0_EXT_DEV]], %[[DIM1_EXT_DEV]]
+! CHECK:        %{{.*}}:2 = hlfir.declare %{{.*}}(%[[SHAPE]]) {uniq_name = "_QFEa"}
+
+subroutine do_concurrent_shape_shift
+    implicit none
+    integer :: a(2:10)
+    integer :: i
+
+    do concurrent (i=1:10)
+        a(i) = i
+    end do
+end subroutine do_concurrent_shape_shift
+
+! CHECK: fir.store %{{c2.*}} to %[[DIM0_STRT:.*]] : !fir.ref<index>
+! CHECK: fir.store %{{c9.*}} to %[[DIM0_EXT:.*]] : !fir.ref<index>
+
+! CHECK: omp.map.info
+! CHECK: omp.map.info
+! CHECK: omp.map.info
+
+! CHECK: omp.map.info
+! CHECK: omp.map.info
+
+! CHECK: %[[DIM0_STRT_MAP:.*]] = omp.map.info 
+! CHECK-SAME:   var_ptr(%[[DIM0_STRT]] : !fir.ref<index>, index)
+! CHECK-SAME:   map_clauses(implicit, exit_release_or_enter_alloc) 
+! CHECK-SAME:   capture(ByCopy) -> !fir.ref<index> {name = "_QF{{.*}}Ea.start_idx.dim0"}
+
+! CHECK: %[[DIM0_EXT_MAP:.*]] = omp.map.info
+! CHECK-SAME:   var_ptr(%[[DIM0_EXT]] : !fir.ref<index>, index)
+! CHECK-SAME:   map_clauses(implicit, exit_release_or_enter_alloc)
+! CHECK-SAME:   capture(ByCopy) -> !fir.ref<index> {name = "_QF{{.*}}Ea.extent.dim0"}
+
+! CHECK: omp.target host_eval({{.*}}) map_entries(
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:   %[[DIM0_STRT_MAP]] -> %[[DIM0_STRT_ARG:[^,]+]],
+! CHECK-SAME:   %[[DIM0_EXT_MAP]] -> %[[DIM0_EXT_ARG:[^,]+]] : {{.*}})
+
+! CHECK-DAG:    %[[DIM0_STRT_DEV:.*]] = fir.load %[[DIM0_STRT_ARG]]
+! CHECK-DAG:    %[[DIM0_EXT_DEV:.*]] = fir.load %[[DIM0_EXT_ARG]]
+
+! CHECK:        %[[SHAPE_SHIFT:.*]] = fir.shape_shift %[[DIM0_STRT_DEV]], %[[DIM0_EXT_DEV]]
+! CHECK:        %{{.*}}:2 = hlfir.declare %{{.*}}(%[[SHAPE_SHIFT]]) {uniq_name = "_QF{{.*}}Ea"}
+
diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
index d0210726de83e..3ea32f9f4cecc 100644
--- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
+++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
@@ -1,9 +1,14 @@
+! Fails until we update the pass to use the `fir.do_concurrent` op.
+
 ! Tests mapping of a `do concurrent` loop with multiple iteration ranges.
 
 ! RUN: split-file %s %t
 
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %t/multi_range.f90 -o - \
-! RUN:   | FileCheck %s
+! RUN:   | FileCheck %s --check-prefixes=HOST,COMMON
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %t/multi_range.f90 -o - \
+! RUN:   | FileCheck %s --check-prefixes=DEVICE,COMMON
 
 !--- multi_range.f90
 program main
@@ -17,56 +22,75 @@ program main
    end do
 end
 
-! CHECK: func.func @_QQmain
+! COMMON: func.func @_QQmain
+
+! COMMON: %[[C3:.*]] = arith.constant 3 : i32
+! COMMON: %[[LB_I:.*]] = fir.convert %[[C3]] : (i32) -> index
+! COMMON: %[[C20:.*]] = arith.constant 20 : i32
+! COMMON: %[[UB_I:.*]] = fir.convert %[[C20]] : (i32) -> index
+! COMMON: %[[STEP_I:.*]] = arith.constant 1 : index
+
+! COMMON: %[[C5:.*]] = arith.constant 5 : i32
+! COMMON: %[[LB_J:.*]] = fir.convert %[[C5]] : (i32) -> index
+! COMMON: %[[C40:.*]] = arith.constant 40 : i32
+! COMMON: %[[UB_J:.*]] = fir.convert %[[C40]] : (i32) -> index
+! COMMON: %[[STEP_J:.*]] = arith.constant 1 : index
+
+! COMMON: %[[C7:.*]] = arith.constant 7 : i32
+! COMMON: %[[LB_K:.*]] = fir.convert %[[C7]] : (i32) -> index
+! COMMON: %[[C60:.*]] = arith.constant 60 : i32
+! COMMON: %[[UB_K:.*]] = fir.convert %[[C60]] : (i32) -> index
+! COMMON: %[[STEP_K:.*]] = arith.constant 1 : index
+
+! DEVICE: omp.target host_eval(
+! DEVICE-SAME: %[[LB_I]] -> %[[LB_I:[[:alnum:]]+]],
+! DEVICE-SAME: %[[UB_I]] -> %[[UB_I:[[:alnum:]]+]],
+! DEVICE-SAME: %[[STEP_I]] -> %[[STEP_I:[[:alnum:]]+]],
+! DEVICE-SAME: %[[LB_J]] -> %[[LB_J:[[:alnum:]]+]],
+! DEVICE-SAME: %[[UB_J]] -> %[[UB_J:[[:alnum:]]+]],
+! DEVICE-SAME: %[[STEP_J]] -> %[[STEP_J:[[:alnum:]]+]],
+! DEVICE-SAME: %[[LB_K]] -> %[[LB_K:[[:alnum:]]+]],
+! DEVICE-SAME: %[[UB_K]] -> %[[UB_K:[[:alnum:]]+]],
+! DEVICE-SAME: %[[STEP_K]] -> %[[STEP_K:[[:alnum:]]+]] :
+! DEVICE-SAME: index, index, index, index, index, index, index, index, index)
 
-! CHECK: %[[C3:.*]] = arith.constant 3 : i32
-! CHECK: %[[LB_I:.*]] = fir.convert %[[C3]] : (i32) -> index
-! CHECK: %[[C20:.*]] = arith.constant 20 : i32
-! CHECK: %[[UB_I:.*]] = fir.convert %[[C20]] : (i32) -> index
-! CHECK: %[[STEP_I:.*]] = arith.constant 1 : index
+! DEVICE: omp.teams
 
-! CHECK: %[[C5:.*]] = arith.constant 5 : i32
-! CHECK: %[[LB_J:.*]] = fir.convert %[[C5]] : (i32) -> index
-! CHECK: %[[C40:.*]] = arith.constant 40 : i32
-! CHECK: %[[UB_J:.*]] = fir.convert %[[C40]] : (i32) -> index
-! CHECK: %[[STEP_J:.*]] = arith.constant 1 : index
+! HOST-NOT: omp.target
+! HOST-NOT: omp.teams
 
-! CHECK: %[[C7:.*]] = arith.constant 7 : i32
-! CHECK: %[[LB_K:.*]] = fir.convert %[[C7]] : (i32) -> index
-! CHECK: %[[C60:.*]] = arith.constant 60 : i32
-! CHECK: %[[UB_K:.*]] = fir.convert %[[C60]] : (i32) -> index
-! CHECK: %[[STEP_K:.*]] = arith.constant 1 : index
+! COMMON: omp.parallel {
 
-! CHECK: omp.parallel {
+! COMMON-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"}
+! COMMON-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"}
 
-! CHECK-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"}
-! CHECK-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"}
+! COMMON-NEXT: %[[ITER_VAR_J:.*]] = fir.alloca i32 {bindc_name = "j"}
+! COMMON-NEXT: %[[BINDING_J:.*]]:2 = hlfir.declare %[[ITER_VAR_J]] {uniq_name = "_QFEj"}
 
-! CHECK-NEXT: %[[ITER_VAR_J:.*]] = fir.alloca i32 {bindc_name = "j"}
-! CHECK-NEXT: %[[BINDING_J:.*]]:2 = hlfir.declare %[[ITER_VAR_J]] {uniq_name = "_QFEj"}
+! COMMON-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"}
+! COMMON-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"}
 
-! CHECK-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"}
-! CHECK-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"}
+! DEVICE: omp.distribute
 
-! CHECK: omp.wsloop {
-! CHECK-NEXT: omp.loop_nest
-! CHECK-SAME:   (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]])
-! CHECK-SAME:   : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]])
-! CHECK-SAME:     to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive
-! CHECK-SAME:     step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) {
+! COMMON: omp.wsloop {
+! COMMON-NEXT: omp.loop_nest
+! COMMON-SAME:   (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]])
+! COMMON-SAME:   : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]])
+! COMMON-SAME:     to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive
+! COMMON-SAME:     step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) {
 
-! CHECK-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]]
-! CHECK-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#0
+! COMMON-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]]
+! COMMON-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#0
 
-! CHECK-NEXT: %[[IV_IDX_J:.*]] = fir.convert %[[ARG1]]
-! CHECK-NEXT: fir.store %[[IV_IDX_J]] to %[[BINDING_J]]#0
+! COMMON-NEXT: %[[IV_IDX_J:.*]] = fir.convert %[[ARG1]]
+! COMMON-NEXT: fir.store %[[IV_IDX_J]] to %[[BINDING_J]]#0
 
-! CHECK-NEXT: %[[IV_IDX_K:.*]] = fir.convert %[[ARG2]]
-! CHECK-NEXT: fir.store %[[IV_IDX_K]] to %[[BINDING_K]]#0
+! COMMON-NEXT: %[[IV_IDX_K:.*]] = fir.convert %[[ARG2]]
+! COMMON-NEXT: fir.store %[[IV_IDX_K]] to %[[BINDING_K]]#0
 
-! CHECK:      omp.yield
-! CHECK-NEXT: }
-! CHECK-NEXT: }
+! COMMON:      omp.yield
+! COMMON-NEXT: }
+! COMMON-NEXT: }
 
-! CHECK-NEXT: omp.terminator
-! CHECK-NEXT: }
+! HOST-NEXT: omp.terminator
+! HOST-NEXT: }
diff --git a/flang/test/Transforms/DoConcurrent/non_reference_to_device.f90 b/flang/test/Transforms/DoConcurrent/non_reference_to_device.f90
new file mode 100644
index 0000000000000..b6b2136e2d405
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/non_reference_to_device.f90
@@ -0,0 +1,34 @@
+! Tests that we can map "unnamed" and non-reference/non-box values to device; for
+! example, values that result from `fir.box_dims` ops.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s
+! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s
+
+subroutine test_non_refernece
+  integer i
+  real, allocatable :: arr(:)
+
+  associate(a => arr)
+    do concurrent (i = 1:10)
+      block
+        real z(size(a,1))
+      end block
+    end do
+  end associate
+end subroutine test_non_refernece
+
+! CHECK: omp.map.info var_ptr(%{{.*}} : !fir.ref<index>, index)
+! CHECK: omp.map.info var_ptr(%{{.*}} : !fir.ref<index>, index)
+! CHECK: omp.map.info var_ptr(%{{.*}} : !fir.ref<index>, index)
+
+! CHECK:      %[[DIM_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : !fir.ref<index>, index)
+! CHECK-SAME:                     map_clauses(implicit, exit_release_or_enter_alloc)
+! CHECK-SAME:                     capture(ByCopy) -> !fir.ref<index> {name = ""}
+
+
+! CHECK:      omp.target host_eval({{.*}} : index, index, index)
+! CHECK-SAME:   map_entries({{.*}}, %[[DIM_MAP]] -> %{{.*}} :
+! CHECK-SAME:               !fir.ref<i32>, !fir.ref<index>)
+
diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
index 74799359e0476..c87cf392bd5d6 100644
--- a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
+++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90
@@ -1,8 +1,14 @@
+! Fails until we update the pass to use the `fir.do_concurrent` op.
+
 ! Tests that if `do concurrent` is not perfectly nested in its parent loop, that
 ! we skip converting the not-perfectly nested `do concurrent` loop.
 
+
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
-! RUN:   | FileCheck %s
+! RUN:   | FileCheck %s --check-prefixes=COMMON
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s --check-prefixes=DEVICE,COMMON
 
 program main
    integer, parameter :: n = 10
@@ -19,28 +25,46 @@ program main
    end do
 end
 
-! CHECK: omp.parallel {
 
-! CHECK: omp.wsloop {
-! CHECK: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} {
-! CHECK:   fir.do_concurrent {
 
-! CHECK:     %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"}
-! CHECK:     %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]]
+! DEVICE: omp.target {{.*}}map_entries(
+! DEVICE-SAME:   %{{[[:alnum:]]+}} -> %{{[^,]+}},
+! DEVICE-SAME:   %{{[[:alnum:]]+}} -> %{{[^,]+}},
+! DEVICE-SAME:   %{{[[:alnum:]]+}} -> %{{[^,]+}},
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %[[I_ARG:[^,]+]],
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %[[X_ARG:[^,]+]],
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %[[A_ARG:[^,]+]],
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %{{[^:]+}} :
+! DEVICE-SAME:   {{.*}}) {
+
+! DEVICE: omp.teams
+
+! COMMON: omp.parallel {
+
+! DEVICE: omp.distribute
+
+! COMMON: omp.wsloop {
+! COMMON: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} {
+! COMMON:   fir.do_concurrent {
+
+! COMMON:     %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j"}
+! COMMON:     %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]]
 
-! CHECK:     %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"}
-! CHECK:     %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]]
+! COMMON:     %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"}
+! COMMON:     %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]]
 
-! CHECK:     fir.do_concurrent.loop (%[[J_IV:.*]], %[[K_IV:.*]]) = {{.*}} {
-! CHECK:       %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32
-! CHECK:       fir.store %[[J_IV_CONV]] to %[[ORIG_J_DECL]]#0
+! COMMON:     fir.do_concurrent.loop (%[[J_IV:.*]], %[[K_IV:.*]]) = {{.*}} {
+! COMMON:       %[[J_IV_CONV:.*]] = fir.convert %[[J_IV]] : (index) -> i32
+! COMMON:       fir.store %[[J_IV_CONV]] to %[[ORIG_J_DECL]]#0
 
-! CHECK:       %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32
-! CHECK:       fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0
-! CHECK:     }
-! CHECK:   }
-! CHECK: omp.yield
-! CHECK: }
-! CHECK: }
-! CHECK: omp.terminator
-! CHECK: }
+! COMMON:       %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32
+! COMMON:       fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0
+! COMMON:     }
+! COMMON:   }
+! COMMON: omp.yield
+! COMMON: }
+! COMMON: }
+! COMMON: omp.terminator
+! COMMON: }
diff --git a/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90 b/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90
new file mode 100644
index 0000000000000..e38474a68747f
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90
@@ -0,0 +1,42 @@
+! Tests `do concurrent` mapping when mapped value(s) depend on values defined
+! outside the target region; e.g. the size of the array is dynamic. This needs
+! to be handled by localizing these region outsiders by either cloning them in
+! the region or, if we cannot do that, mapping them and using the mapped values.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s
+
+subroutine foo(n)
+  implicit none
+  integer :: n
+  integer :: i
+  integer, dimension(n) :: a
+
+  do concurrent(i=1:10)
+    a(i) = i
+  end do
+end subroutine
+
+! CHECK-DAG: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFfooEi"}
+! CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFfooEa"}
+
+! CHECK-DAG: %[[I_MAP:.*]] = omp.map.info var_ptr(%[[I_DECL]]#1 : {{.*}}) {{.*}} {name = "_QFfooEi"}
+! CHECK-DAG: %[[A_MAP:.*]] = omp.map.info var_ptr(%[[A_DECL]]#1 : {{.*}}) {{.*}} {name = "_QFfooEa"}
+! CHECK-DAG: %[[N_MAP:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}) {{.*}} {name = "_QFfooEa.extent.dim0"}
+
+! CHECK: omp.target
+! CHECK-SAME: map_entries(
+! CHECK-SAME:     %{{[[:alnum:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:     %{{[[:alnum:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:     %{{[[:alnum:]]+}} -> %{{[^,]+}},
+! CHECK-SAME:     %[[I_MAP]] -> %[[I_ARG:arg[0-9]*]],
+! CHECK-SAME:     %[[A_MAP]] -> %[[A_ARG:arg[0-9]*]],
+! CHECK-SAME:     %[[N_MAP]] -> %[[N_ARG:arg[0-9]*]] : {{.*}})
+! CHECK-SAME: {{.*}} {
+
+! CHECK-DAG:  %{{.*}} = hlfir.declare %[[I_ARG]]
+! CHECK-DAG:  %{{.*}} = hlfir.declare %[[A_ARG]]
+! CHECK-DAG:  %{{.*}} = fir.load %[[N_ARG]]
+
+! CHECK:   omp.terminator
+! CHECK: }
diff --git a/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90 b/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90
new file mode 100644
index 0000000000000..2dada05396ad6
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90
@@ -0,0 +1,68 @@
+! Tests that if `do concurrent` is indirectly nested in its parent loop, we skip
+! converting the indirectly nested `do concurrent` loop.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \
+! RUN:   | FileCheck %s --check-prefixes=HOST,COMMON
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=device %s -o - \
+! RUN:   | FileCheck %s --check-prefixes=DEVICE,COMMON
+
+program main
+   integer, parameter :: n = 10
+   integer, parameter :: m = 20
+   integer, parameter :: l = 30
+   integer x;
+   integer :: a(n, m, l)
+
+   do concurrent(i=1:n)
+     do j=1,m
+       do concurrent(k=1:l)
+         a(i,j,k) = i * j + k
+       end do
+     end do
+   end do
+end
+
+! HOST: %[[ORIG_J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j", {{.*}}}
+! HOST: %[[ORIG_J_DECL:.*]]:2 = hlfir.declare %[[ORIG_J_ALLOC]]
+
+! DEVICE: omp.target {{.*}}map_entries(
+! DEVICE-SAME:   %{{[[:alnum:]]+}} -> %{{[^,]+}},
+! DEVICE-SAME:   %{{[[:alnum:]]+}} -> %{{[^,]+}},
+! DEVICE-SAME:   %{{[[:alnum:]]+}} -> %{{[^,]+}},
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %[[I_ARG:[^,]+]],
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %[[J_ARG:[^,]+]],
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %[[A_ARG:[^,]+]],
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %{{[^,]+}},
+! DEVICE-SAME:   %{{[^[:space:]]+}} -> %{{[^:]+}} :
+! DEVICE-SAME:   {{.*}}) {
+
+! DEVICE: %[[TARGET_J_DECL:.*]]:2 = hlfir.declare %[[J_ARG]] {uniq_name = "_QFEj"}
+
+! DEVICE: omp.teams
+
+! COMMON: omp.parallel {
+
+! DEVICE: omp.distribute
+
+! COMMON: omp.wsloop {
+! COMMON: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} {
+! COMMON:   fir.do_loop {{.*}} iter_args(%[[J_IV:.*]] = {{.*}}) -> {{.*}} {
+! HOST:       fir.store %[[J_IV]] to %[[ORIG_J_DECL]]#0
+! DEVICE:     fir.store %[[J_IV]] to %[[TARGET_J_DECL]]#0
+
+! COMMON:     fir.do_concurrent {
+! COMMON:         %[[ORIG_K_ALLOC:.*]] = fir.alloca i32 {bindc_name = "k"}
+! COMMON:         %[[ORIG_K_DECL:.*]]:2 = hlfir.declare %[[ORIG_K_ALLOC]]
+! COMMON:       fir.do_concurrent.loop (%[[K_IV:.*]]) = {{.*}} {
+! COMMON:         %[[K_IV_CONV:.*]] = fir.convert %[[K_IV]] : (index) -> i32
+! COMMON:           fir.store %[[K_IV_CONV]] to %[[ORIG_K_DECL]]#0
+! COMMON:       }
+! COMMON:     }
+! COMMON:   }
+! COMMON: omp.yield
+! COMMON: }
+! COMMON: }
+! COMMON: omp.terminator
+! COMMON: }