[clang] [flang] [llvm] [mlir] [OpenMP][flang] Add initial support for by-ref reductions on the GPU (PR #165714)

Kareem Ergawy via cfe-commits cfe-commits at lists.llvm.org
Fri Oct 31 03:12:43 PDT 2025


https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/165714

>From 1fdd4e700c9f77f927e65f291b055e3f7ce3bca2 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Fri, 17 Oct 2025 08:35:07 -0500
Subject: [PATCH] [OpenMP][flang] Add initial support for by-ref reductions on
 the GPU

Adds initial support for GPU by-ref reductions. In particular, this diff
adds support for reductions on scalar allocatables where reductions
happen on loops nested in `target` regions. For example:

```fortran
  integer :: i
  real, allocatable :: scalar_alloc

  allocate(scalar_alloc)
  scalar_alloc = 0

  !$omp target map(tofrom: scalar_alloc)
  !$omp parallel do reduction(+: scalar_alloc)
  do i = 1, 1000000
    scalar_alloc = scalar_alloc + 1
  end do
  !$omp end target
```

This PR supports by-ref reductions on the intra- and inter-warp levels.

So far, there are still steps to be taken for full support of by-ref
reductions, for example:
* Inter-block value combination is still not supported.
  Therefore, `target teams distribute parallel do` is still not
  supported.
* Support for dynamically-sized arrays still needs to be added.
* Support for more than one allocatable/array on the same `reduction`
  clause still needs to be added.
---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      |   4 +-
 .../include/flang/Optimizer/Dialect/FIROps.td |   3 +-
 .../lib/Lower/Support/ReductionProcessor.cpp  |  10 +-
 .../OpenMP/DoConcurrentConversion.cpp         |   3 +-
 .../delayed-privatization-reduction-byref.f90 |   2 +-
 .../parallel-reduction-allocatable-array.f90  |   2 +-
 .../OpenMP/parallel-reduction-array-lb.f90    |   2 +-
 .../Lower/OpenMP/parallel-reduction-array.f90 |   2 +-
 .../OpenMP/parallel-reduction-array2.f90      |   2 +-
 .../parallel-reduction-pointer-array.f90      |   2 +-
 .../test/Lower/OpenMP/parallel-reduction3.f90 |   2 +-
 .../OpenMP/reduction-array-intrinsic.f90      |   2 +-
 .../Lower/OpenMP/sections-array-reduction.f90 |   2 +-
 .../OpenMP/taskgroup-task-array-reduction.f90 |   2 +-
 ...oop-reduction-allocatable-array-minmax.f90 |   4 +-
 .../OpenMP/wsloop-reduction-allocatable.f90   |   2 +-
 .../wsloop-reduction-array-assumed-shape.f90  |   2 +-
 .../OpenMP/wsloop-reduction-array-lb.f90      |   2 +-
 .../OpenMP/wsloop-reduction-array-lb2.f90     |   2 +-
 .../Lower/OpenMP/wsloop-reduction-array.f90   |   2 +-
 .../Lower/OpenMP/wsloop-reduction-array2.f90  |   2 +-
 .../wsloop-reduction-multiple-clauses.f90     |   2 +-
 .../Lower/OpenMP/wsloop-reduction-pointer.f90 |   2 +-
 .../do_concurrent_reduce_allocatable.f90      |   2 +-
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  53 ++++--
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 161 ++++++++++++++----
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |   4 +-
 .../Conversion/SCFToOpenMP/SCFToOpenMP.cpp    |   3 +-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  24 ++-
 .../LLVMIR/allocatable_gpu_reduction.mlir     |  92 ++++++++++
 .../omptarget-multi-block-reduction.mlir      |   6 +-
 .../LLVMIR/omptarget-multi-reduction.mlir     |   8 +-
 .../omptarget-teams-distribute-reduction.mlir |   2 +-
 .../LLVMIR/omptarget-teams-reduction.mlir     |   2 +-
 34 files changed, 327 insertions(+), 90 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index fddeba98adccc..ed6ccab06181a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1784,8 +1784,8 @@ void CGOpenMPRuntimeGPU::emitReduction(
 
   llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
       cantFail(OMPBuilder.createReductionsGPU(
-          OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
-          llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
+          OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, /*IsByRef=*/{}, false,
+          TeamsReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
           CGF.getTarget().getGridValue(),
           C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc));
   CGF.Builder.restoreIP(AfterIP);
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 58a317cf5d691..ff4dab1136ee9 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3743,7 +3743,8 @@ def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove,
   }];
 
   let arguments = (ins SymbolNameAttr:$sym_name,
-                       TypeAttr:$type);
+                       TypeAttr:$type,
+                       OptionalAttr<TypeAttr>:$byref_element_type);
 
   let regions = (region MaxSizedRegion<1>:$allocRegion,
                         AnyRegion:$initializerRegion,
diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp
index 605a5b6b20b94..1bce6b3b3c832 100644
--- a/flang/lib/Lower/Support/ReductionProcessor.cpp
+++ b/flang/lib/Lower/Support/ReductionProcessor.cpp
@@ -573,10 +573,18 @@ OpType ReductionProcessor::createDeclareReduction(
 
   mlir::OpBuilder modBuilder(module.getBodyRegion());
   mlir::Type valTy = fir::unwrapRefType(type);
+  // For by-ref reductions, we want to keep track of the
+  // boxed/referenced/allocated type. For example, for a `real, allocatable`
+  // variable, `real` should be stored.
+  mlir::TypeAttr boxedTy{};
+
   if (!isByRef)
     type = valTy;
 
-  decl = OpType::create(modBuilder, loc, reductionOpName, type);
+  if (isByRef)
+    boxedTy = mlir::TypeAttr::get(fir::unwrapPassByRefType(valTy));
+
+  decl = OpType::create(modBuilder, loc, reductionOpName, type, boxedTy);
   createReductionAllocAndInitRegions(converter, loc, decl, redId, type,
                                      isByRef);
 
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 1229018bd9b3e..11609ea7b6040 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -851,7 +851,8 @@ class DoConcurrentConversion
         if (!ompReducer) {
           ompReducer = mlir::omp::DeclareReductionOp::create(
               rewriter, firReducer.getLoc(), ompReducerName,
-              firReducer.getTypeAttr().getValue());
+              firReducer.getTypeAttr().getValue(),
+              firReducer.getByrefElementTypeAttr());
 
           cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(),
                               ompReducer.getAllocRegion());
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
index 4b6a643f94059..4c7b6ac5f5f9b 100644
--- a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
+++ b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
@@ -22,7 +22,7 @@ subroutine red_and_delayed_private
 ! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : i32
 
 ! CHECK-LABEL: omp.declare_reduction
-! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> alloc
+! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> attributes {byref_element_type = i32} alloc
 
 ! CHECK-LABEL: _QPred_and_delayed_private
 ! CHECK: omp.parallel
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90
index 41c7d69ebb3ba..f56875dcb518b 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90
@@ -18,7 +18,7 @@ program reduce
 
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> attributes {byref_element_type = !fir.array<?xi32>} alloc {
 ! CHECK:           %[[VAL_10:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
 ! CHECK:           omp.yield(%[[VAL_10]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90
index aa91e1e0e8b15..d9ba3bed464f8 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90
@@ -12,7 +12,7 @@ program reduce
 
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref<!fir.box<!fir.array<3x2xi32>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref<!fir.box<!fir.array<3x2xi32>>> {{.*}} alloc {
 ! CHECK:           %[[VAL_15:.*]] = fir.alloca !fir.box<!fir.array<3x2xi32>>
 ! CHECK:           omp.yield(%[[VAL_15]] : !fir.ref<!fir.box<!fir.array<3x2xi32>>>)
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
index 59595de338d50..636660f279e85 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-array.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
@@ -17,7 +17,7 @@ program reduce
 print *,i
 end program
 
-! CPU-LABEL:   omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
+! CPU-LABEL:   omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> attributes {byref_element_type = !fir.array<3xi32>} alloc {
 ! CPU:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
 ! CPU:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
 ! CPU-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
index 14338c6f50817..9cf8a63427ed1 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
@@ -13,7 +13,7 @@ program reduce
 print *,i
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> {{.*}} alloc {
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
 ! CHECK:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
index 36344458d1cae..3de2ba8f61f8e 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
@@ -19,7 +19,7 @@ program reduce
 
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> attributes {byref_element_type = !fir.array<?xi32>} alloc {
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
 ! CHECK:           omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90
index 9af18378f0ae0..da337378862be 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction3.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90
@@ -1,7 +1,7 @@
 ! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc {
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
 ! CHECK:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xi32>>>)
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
index 8b94d51f986f5..4a0593ff9eca4 100644
--- a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
+++ b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
@@ -9,7 +9,7 @@ subroutine max_array_reduction(l, r)
   !$omp end parallel
 end subroutine
 
-! CHECK-LABEL:   omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc {
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
 ! CHECK:           omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.array<?xi32>>>)
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/sections-array-reduction.f90 b/flang/test/Lower/OpenMP/sections-array-reduction.f90
index 2f2808cebfc0c..0dbe9e3673395 100644
--- a/flang/test/Lower/OpenMP/sections-array-reduction.f90
+++ b/flang/test/Lower/OpenMP/sections-array-reduction.f90
@@ -14,7 +14,7 @@ subroutine sectionsReduction(x)
 end subroutine
 
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> {{.*}} alloc {
 ! [...]
 ! CHECK:           omp.yield
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
index 18a4f75b86309..3a63bb09c59de 100644
--- a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
+++ b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
@@ -1,7 +1,7 @@
 ! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
 
-! CHECK-LABEL:  omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> alloc {
+! CHECK-LABEL:  omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> {{.*}} alloc {
 !                 [...]
 ! CHECK:          omp.yield
 ! CHECK-LABEL:  } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90
index 2cd953de0dffa..ed81577ecce16 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90
@@ -32,7 +32,7 @@ program reduce15
   print *,"min: ", mins
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}} alloc {
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
 ! CHECK:           omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 ! CHECK-LABEL:   } init {
@@ -93,7 +93,7 @@ program reduce15
 ! CHECK:           omp.yield
 ! CHECK:         }
 
-! CHECK-LABEL:   omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}} alloc {
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
 ! CHECK:           omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90
index 663851cba46c6..d8c0a36db126e 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90
@@ -18,7 +18,7 @@ program reduce
 
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> attributes {byref_element_type = i32} alloc {
 ! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<i32>>
 ! CHECK:           omp.yield(%[[VAL_2]] : !fir.ref<!fir.box<!fir.heap<i32>>>)
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
index 209ee9a4e0cef..28acb8f19531f 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
@@ -22,7 +22,7 @@ subroutine reduce(r)
 end subroutine
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref<!fir.box<!fir.array<?xf64>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref<!fir.box<!fir.array<?xf64>>> {{.*}} alloc {
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
 ! CHECK:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf64>>>)
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90
index 2233a74600948..ec448cf20f111 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90
@@ -11,7 +11,7 @@ program reduce
   !$omp end parallel do
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> {{.*}} alloc {
 ! CHECK:         } combiner {
 ! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>):
 ! CHECK:           %[[ARR0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90
index 211bde19da8db..9da05a290ec21 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90
@@ -19,7 +19,7 @@ subroutine sub(a, lb, ub)
 
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc {
 ! CHECK:         } combiner {
 ! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.array<?xi32>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.array<?xi32>>>):
 ! CHECK:           %[[ARR0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
index afaeba27c5eae..14b657c8e180d 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
@@ -14,7 +14,7 @@ program reduce
 print *,r
 end program
 
-! CHECK-LABEL:  omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc {
+! CHECK-LABEL:  omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> attributes {byref_element_type = !fir.array<2xi32>} alloc {
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
 ! CHECK:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>)
 ! CHECK-LABEL:  } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
index 25b2e97a1b7f7..d0a0c38e4ccb1 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
@@ -14,7 +14,7 @@ program reduce
 print *,r
 end program
 
-! CHECK-LABEL:  omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc {
+! CHECK-LABEL:  omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> {{.*}} alloc {
 ! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
 ! CHECK:           omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>)
 ! CHECK-LABEL:  } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90
index edd2bcb1d6be8..60a162d8f8002 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90
@@ -24,7 +24,7 @@ program main
 
 endprogram
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref<!fir.box<!fir.array<3x3xf64>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref<!fir.box<!fir.array<3x3xf64>>> {{.*}} alloc {
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<3x3xf64>>
 ! CHECK:           omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.array<3x3xf64>>>)
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90
index 27b726376fbeb..f640f5caddf76 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90
@@ -18,7 +18,7 @@ program reduce_pointer
   deallocate(v)
 end program
 
-! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> alloc {
+! CHECK-LABEL:   omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> {{.*}} alloc {
 ! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<i32>>
 ! CHECK:           omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<i32>>>)
 ! CHECK-LABEL:   } init {
diff --git a/flang/test/Lower/do_concurrent_reduce_allocatable.f90 b/flang/test/Lower/do_concurrent_reduce_allocatable.f90
index 873fd10dd1b97..4fb67c094b594 100644
--- a/flang/test/Lower/do_concurrent_reduce_allocatable.f90
+++ b/flang/test/Lower/do_concurrent_reduce_allocatable.f90
@@ -8,7 +8,7 @@ subroutine do_concurrent_allocatable
   end do
 end subroutine
 
-! CHECK: fir.declare_reduction @[[RED_OP:.*]] : ![[RED_TYPE:.*]] alloc {
+! CHECK: fir.declare_reduction @[[RED_OP:.*]] : ![[RED_TYPE:.*]] attributes {byref_element_type = !fir.array<?x?xf32>} alloc {
 ! CHECK:   %[[ALLOC:.*]] = fir.alloca
 ! CHECK:   fir.yield(%[[ALLOC]] : ![[RED_TYPE]])
 ! CHECK: } init {
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5331cb5abdc6f..dff376f3a22e4 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1448,17 +1448,22 @@ class OpenMPIRBuilder {
     ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
                   EvalKind EvaluationKind, ReductionGenCBTy ReductionGen,
                   ReductionGenClangCBTy ReductionGenClang,
-                  ReductionGenAtomicCBTy AtomicReductionGen)
+                  ReductionGenAtomicCBTy AtomicReductionGen,
+                  Type *ByRefAllocatedType = nullptr,
+                  Type *ByRefElementType = nullptr)
         : ElementType(ElementType), Variable(Variable),
           PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
           ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
-          AtomicReductionGen(AtomicReductionGen) {}
+          AtomicReductionGen(AtomicReductionGen),
+          ByRefAllocatedType(ByRefAllocatedType),
+          ByRefElementType(ByRefElementType) {}
     ReductionInfo(Value *PrivateVariable)
         : ElementType(nullptr), Variable(nullptr),
           PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar),
           ReductionGen(), ReductionGenClang(), AtomicReductionGen() {}
 
-    /// Reduction element type, must match pointee type of variable.
+    /// Reduction element type, must match pointee type of variable. For by-ref
+    /// reductions, this would be just an opaque `ptr`.
     Type *ElementType;
 
     /// Reduction variable of pointer type.
@@ -1485,6 +1490,18 @@ class OpenMPIRBuilder {
     /// reduction. If null, the implementation will use the non-atomic version
     /// along with the appropriate synchronization mechanisms.
     ReductionGenAtomicCBTy AtomicReductionGen;
+
+    /// For by-ref reductions, we need to keep track of 2 extra types that are
+    /// potentially different:
+    /// * The allocated type is the type of the storage allocated by the
+    /// reduction op's `alloc` region. For example, for allocatables and arrays,
+    /// this type would be the descriptor/box struct.
+    Type *ByRefAllocatedType;
+    /// * The by-ref element type is the type of the actual storage needed for
+    /// the data of the allocatable or array. For example, a float allocatable
+    /// would need some float storage to store intermediate reduction
+    /// results.
+    Type *ByRefElementType;
   };
 
   enum class CopyAction : unsigned {
@@ -1529,14 +1546,15 @@ class OpenMPIRBuilder {
 
   /// Function to shuffle over the value from the remote lane.
   void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
-                       Type *ElementType, Value *Offset,
-                       Type *ReductionArrayTy);
+                       Type *ElementType, Value *Offset, Type *ReductionArrayTy,
+                       bool IsByRefElem);
 
   /// Emit instructions to copy a Reduce list, which contains partially
   /// aggregated values, in the specified direction.
   void emitReductionListCopy(
       InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
       ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
+      ArrayRef<bool> IsByRef,
       CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
 
   /// Emit a helper that reduces data across two OpenMP threads (lanes)
@@ -1610,11 +1628,13 @@ class OpenMPIRBuilder {
   /// \param ReduceFn The reduction function.
   /// \param FuncAttrs Optional param to specify any function attributes that
   ///                  need to be copied to the new function.
+  /// \param IsByRef For each reduction clause, whether the reduction is by-ref
+  ///                  or not.
   ///
   /// \return The ShuffleAndReduce function.
   Function *emitShuffleAndReduceFunction(
       ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
-      Function *ReduceFn, AttributeList FuncAttrs);
+      Function *ReduceFn, AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
 
   /// Helper function for CreateCanonicalScanLoops to create InputLoop
   /// in the firstGen and Scan Loop in the SecondGen
@@ -1674,12 +1694,14 @@ class OpenMPIRBuilder {
   /// \param ReductionInfos Array type containing the ReductionOps.
   /// \param FuncAttrs Optional param to specify any function attributes that
   ///                  need to be copied to the new function.
+  /// \param IsByRef For each reduction clause, whether the reduction is by-ref
+  ///                  or not.
   ///
   /// \return The InterWarpCopy function.
   Expected<Function *>
   emitInterWarpCopyFunction(const LocationDescription &Loc,
                             ArrayRef<ReductionInfo> ReductionInfos,
-                            AttributeList FuncAttrs);
+                            AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
 
   /// This function emits a helper that copies all the reduction variables from
   /// the team into the provided global buffer for the reduction variables.
@@ -1773,6 +1795,7 @@ class OpenMPIRBuilder {
   /// \return The reduction function.
   Expected<Function *> createReductionFunction(
       StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
+      ArrayRef<bool> IsByRef,
       ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
       AttributeList FuncAttrs = {});
 
@@ -2025,11 +2048,14 @@ class OpenMPIRBuilder {
   ///                           reduction variables.
   /// \param AllocaIP           An insertion point suitable for allocas usable
   ///                           in reductions.
-  /// \param CodeGenIP           An insertion point suitable for code
-  /// generation. \param ReductionInfos     A list of info on each reduction
-  /// variable. \param IsNoWait           Optional flag set if the reduction is
-  /// marked as
-  ///                           nowait.
+  /// \param CodeGenIP          An insertion point suitable for code
+  ///                           generation.
+  /// \param ReductionInfos     A list of info on each reduction
+  ///                           variable.
+  /// \param IsNoWait           Optional flag set if the reduction is
+  ///                           marked as nowait.
+  /// \param IsByRef For each reduction clause, whether the reduction is by-ref
+  ///                  or not.
   /// \param IsTeamsReduction   Optional flag set if it is a teams
   ///                           reduction.
   /// \param GridValue          Optional GPU grid value.
@@ -2039,7 +2065,8 @@ class OpenMPIRBuilder {
   LLVM_ABI InsertPointOrErrorTy createReductionsGPU(
       const LocationDescription &Loc, InsertPointTy AllocaIP,
       InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
-      bool IsNoWait = false, bool IsTeamsReduction = false,
+      ArrayRef<bool> IsByRef, bool IsNoWait = false,
+      bool IsTeamsReduction = false,
       ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
       std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
       Value *SrcLocInfo = nullptr);
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 0e5926ff0fb18..db25b6ea357ce 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2450,7 +2450,8 @@ Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
 
 void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
                                       Value *DstAddr, Type *ElemType,
-                                      Value *Offset, Type *ReductionArrayTy) {
+                                      Value *Offset, Type *ReductionArrayTy,
+                                      bool IsByRefElem) {
   uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
   // Create the loop over the big sized data.
   // ptr = (void*)Elem;
@@ -2535,7 +2536,7 @@ void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
 void OpenMPIRBuilder::emitReductionListCopy(
     InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
     ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
-    CopyOptionsTy CopyOptions) {
+    ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
   Type *IndexTy = Builder.getIndexTy(
       M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
   Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
@@ -2545,6 +2546,7 @@ void OpenMPIRBuilder::emitReductionListCopy(
   for (auto En : enumerate(ReductionInfos)) {
     const ReductionInfo &RI = En.value();
     Value *SrcElementAddr = nullptr;
+    AllocaInst *DestAlloca = nullptr;
     Value *DestElementAddr = nullptr;
     Value *DestElementPtrAddr = nullptr;
     // Should we shuffle in an element from a remote lane?
@@ -2564,14 +2566,18 @@ void OpenMPIRBuilder::emitReductionListCopy(
     DestElementPtrAddr = Builder.CreateInBoundsGEP(
         ReductionArrayTy, DestBase,
         {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
     switch (Action) {
     case CopyAction::RemoteLaneToThread: {
       InsertPointTy CurIP = Builder.saveIP();
       Builder.restoreIP(AllocaIP);
-      AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
-                                                    ".omp.reduction.element");
+
+      Type *DestAllocaType =
+          IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
+      DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
+                                        ".omp.reduction.element");
       DestAlloca->setAlignment(
-          M.getDataLayout().getPrefTypeAlign(RI.ElementType));
+          M.getDataLayout().getPrefTypeAlign(DestAllocaType));
       DestElementAddr = DestAlloca;
       DestElementAddr =
           Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
@@ -2591,8 +2597,49 @@ void OpenMPIRBuilder::emitReductionListCopy(
     // Now that all active lanes have read the element in the
     // Reduce list, shuffle over the value from the remote lane.
     if (ShuffleInElement) {
-      shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
-                      RemoteLaneOffset, ReductionArrayTy);
+      Type *ShuffleType = RI.ElementType;
+      Value *ShuffleSrcAddr = SrcElementAddr;
+      Value *ShuffleDestAddr = DestElementAddr;
+      Value *Zero = ConstantInt::get(Builder.getInt32Ty(), 0);
+      AllocaInst *LocalStorage = nullptr;
+
+      if (IsByRefElem) {
+        assert(RI.ByRefElementType && "Expected by-ref element type to be set");
+        assert(RI.ByRefAllocatedType &&
+               "Expected by-ref allocated type to be set");
+        // For by-ref reductions, we need to copy from the remote lane the
+        // actual value of the partial reduction computed by that remote lane,
+        // rather than, for example, a pointer to that data or, even worse, a
+        // pointer to the descriptor of the by-ref reduction element.
+        ShuffleType = RI.ByRefElementType;
+
+        ShuffleSrcAddr = Builder.CreateGEP(RI.ByRefAllocatedType,
+                                           ShuffleSrcAddr, {Zero, Zero});
+        ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
+
+        {
+          auto OldIP = Builder.saveIP();
+          Builder.restoreIP(AllocaIP);
+
+          LocalStorage = Builder.CreateAlloca(ShuffleType);
+          Builder.restoreIP(OldIP);
+          ShuffleDestAddr = LocalStorage;
+        }
+      }
+
+      shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
+                      RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
+
+      if (IsByRefElem) {
+        auto *GEP =
+            Builder.CreateGEP(RI.ByRefAllocatedType,
+                              Builder.CreatePointerBitCastOrAddrSpaceCast(
+                                  DestAlloca, Builder.getPtrTy(), ".ascast"),
+                              {Zero, Zero});
+        Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
+                                LocalStorage, Builder.getPtrTy(), ".ascast"),
+                            GEP);
+      }
     } else {
       switch (RI.EvaluationKind) {
       case EvalKind::Scalar: {
@@ -2647,7 +2694,7 @@ void OpenMPIRBuilder::emitReductionListCopy(
 
 Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
     const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
-    AttributeList FuncAttrs) {
+    AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
   InsertPointTy SavedIP = Builder.saveIP();
   LLVMContext &Ctx = M.getContext();
   FunctionType *FuncTy = FunctionType::get(
@@ -2728,7 +2775,9 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
     // memory.
     //
     const ReductionInfo &RI = En.value();
-    unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
+    bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
+    unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
+        IsByRefElem ? RI.ByRefElementType : RI.ElementType);
     for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
       Type *CType = Builder.getIntNTy(TySize * 8);
 
@@ -2791,6 +2840,15 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
                                      ConstantInt::get(IndexTy, En.index())});
       // elemptr = ((CopyType*)(elemptrptr)) + I
       Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+
+      if (IsByRefElem) {
+        Type *Int32Ty = Builder.getInt32Ty();
+        Constant *Zero = ConstantInt::get(Int32Ty, 0);
+        ElemPtr =
+            Builder.CreateGEP(RI.ByRefAllocatedType, ElemPtr, {Zero, Zero});
+        ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
+      }
+
       if (NumIters > 1)
         ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
 
@@ -2846,6 +2904,15 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
       Value *TargetElemPtrVal =
           Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
       Value *TargetElemPtr = TargetElemPtrVal;
+
+      if (IsByRefElem) {
+        Type *Int32Ty = Builder.getInt32Ty();
+        Constant *Zero = ConstantInt::get(Int32Ty, 0);
+        TargetElemPtr = Builder.CreateGEP(RI.ByRefAllocatedType, TargetElemPtr,
+                                          {Zero, Zero});
+        TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
+      }
+
       if (NumIters > 1)
         TargetElemPtr =
             Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
@@ -2882,7 +2949,7 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
 
 Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
     ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
-    AttributeList FuncAttrs) {
+    AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
   LLVMContext &Ctx = M.getContext();
   FunctionType *FuncTy =
       FunctionType::get(Builder.getVoidTy(),
@@ -2961,9 +3028,10 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
   // This loop iterates through the list of reduce elements and copies,
   // element by element, from a remote lane in the warp to RemoteReduceList,
   // hosted on the thread's stack.
-  emitReductionListCopy(
-      AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
-      ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
+  emitReductionListCopy(AllocaIP, CopyAction::RemoteLaneToThread,
+                        RedListArrayTy, ReductionInfos, ReduceList,
+                        RemoteListAddrCast, IsByRef,
+                        {RemoteLaneOffset, nullptr, nullptr});
 
   // The actions to be performed on the Remote Reduce list is dependent
   // on the algorithm version.
@@ -3032,7 +3100,8 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
 
   emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
   emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
-                        ReductionInfos, RemoteListAddrCast, ReduceList);
+                        ReductionInfos, RemoteListAddrCast, ReduceList,
+                        IsByRef);
   Builder.CreateBr(CpyMergeBB);
 
   emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
@@ -3437,7 +3506,8 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
 
 Expected<Function *> OpenMPIRBuilder::createReductionFunction(
     StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
-    ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
+    ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind,
+    AttributeList FuncAttrs) {
   auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
                                    {Builder.getPtrTy(), Builder.getPtrTy()},
                                    /* IsVarArg */ false);
@@ -3498,8 +3568,14 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction(
       LHSPtrs.emplace_back(LHSPtr);
       RHSPtrs.emplace_back(RHSPtr);
     } else {
-      Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
-      Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+      Value *LHS = LHSPtr;
+      Value *RHS = RHSPtr;
+
+      if (!IsByRef.empty() && !IsByRef[En.index()]) {
+        LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+        RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+      }
+
       Value *Reduced;
       InsertPointOrErrorTy AfterIP =
           RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
@@ -3509,7 +3585,9 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction(
         return ReductionFunc;
 
       Builder.restoreIP(*AfterIP);
-      Builder.CreateStore(Reduced, LHSPtr);
+
+      if (!IsByRef.empty() && !IsByRef[En.index()])
+        Builder.CreateStore(Reduced, LHSPtr);
     }
   }
 
@@ -3562,9 +3640,9 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
     InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
-    bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
-    std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
-    Value *SrcLocInfo) {
+    ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
+    ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
+    unsigned ReductionBufNum, Value *SrcLocInfo) {
   if (!updateToLocation(Loc))
     return InsertPointTy();
   Builder.restoreIP(CodeGenIP);
@@ -3600,9 +3678,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
   FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
 
   CodeGenIP = Builder.saveIP();
-  Expected<Function *> ReductionResult =
-      createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
-                              ReductionInfos, ReductionGenCBKind, FuncAttrs);
+  Expected<Function *> ReductionResult = createReductionFunction(
+      Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
+      ReductionGenCBKind, FuncAttrs);
   if (!ReductionResult)
     return ReductionResult.takeError();
   Function *ReductionFunc = *ReductionResult;
@@ -3641,15 +3719,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
     Value *ElemPtr = Builder.CreateInBoundsGEP(
         RedArrayTy, ReductionList,
         {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+
+    auto *PrviateVar = RI.PrivateVariable;
+    bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
+    if (IsByRefElem)
+      PrviateVar = Builder.CreateLoad(RI.ElementType, PrviateVar);
+
     Value *CastElem =
-        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
+        Builder.CreatePointerBitCastOrAddrSpaceCast(PrviateVar, PtrTy);
     Builder.CreateStore(CastElem, ElemPtr);
   }
   CodeGenIP = Builder.saveIP();
-  Function *SarFunc =
-      emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
+  Function *SarFunc = emitShuffleAndReduceFunction(
+      ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
   Expected<Function *> CopyResult =
-      emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
+      emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
   if (!CopyResult)
     return CopyResult.takeError();
   Function *WcFunc = *CopyResult;
@@ -3728,7 +3812,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
   // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
   for (auto En : enumerate(ReductionInfos)) {
     const ReductionInfo &RI = En.value();
-    Value *LHS = RI.Variable;
+    Type *ValueType = RI.ElementType;
+    Value *RedValue = RI.Variable;
     Value *RHS =
         Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
 
@@ -3739,7 +3824,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
 
       // Fix the CallBack code genereated to use the correct Values for the LHS
       // and RHS
-      LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
+      LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
         return cast<Instruction>(U.getUser())->getParent()->getParent() ==
                ReductionFunc;
       });
@@ -3748,15 +3833,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
                ReductionFunc;
       });
     } else {
-      Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
-      Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
+      if (IsByRef.empty() || !IsByRef[En.index()]) {
+        RedValue = Builder.CreateLoad(ValueType, RI.Variable,
+                                      "red.value." + Twine(En.index()));
+      }
+      Value *PrivateRedValue = Builder.CreateLoad(
+          ValueType, RHS, "red.private.value" + Twine(En.index()));
       Value *Reduced;
       InsertPointOrErrorTy AfterIP =
-          RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
+          RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
       if (!AfterIP)
         return AfterIP.takeError();
       Builder.restoreIP(*AfterIP);
-      Builder.CreateStore(Reduced, LHS, false);
+
+      if (!IsByRef.empty() && !IsByRef[En.index()])
+        Builder.CreateStore(Reduced, RI.Variable);
     }
   }
   emitBlock(ExitBB, CurFunc);
@@ -3857,7 +3948,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
   assert(ReductionInfos.size() == IsByRef.size());
   if (Config.isGPU())
     return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
-                               IsNoWait, IsTeamsReduction);
+                               IsByRef, IsNoWait, IsTeamsReduction);
 
   checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
 
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 377f1febf6b8f..386174a36d52c 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -2011,7 +2011,9 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove,
   }];
 
   let arguments = (ins SymbolNameAttr:$sym_name,
-                       TypeAttr:$type);
+                       TypeAttr:$type,
+                       OptionalAttr<TypeAttr>:$byref_element_type
+                       );
 
   let regions = (region MaxSizedRegion<1>:$allocRegion,
                         AnyRegion:$initializerRegion,
diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
index 460595ba9f254..6423d49859c97 100644
--- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
+++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
@@ -188,7 +188,8 @@ createDecl(PatternRewriter &builder, SymbolTable &symbolTable,
   OpBuilder::InsertionGuard guard(builder);
   Type type = reduce.getOperands()[reductionIndex].getType();
   auto decl = omp::DeclareReductionOp::create(builder, reduce.getLoc(),
-                                              "__scf_reduction", type);
+                                              "__scf_reduction", type,
+                                              /*byref_element_type=*/{});
   symbolTable.insert(decl);
 
   builder.createBlock(&decl.getInitializerRegion(),
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 8edec990eaaba..d0852b52f4193 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1311,7 +1311,8 @@ static void collectReductionInfo(
     SmallVectorImpl<OwningReductionGen> &owningReductionGens,
     SmallVectorImpl<OwningAtomicReductionGen> &owningAtomicReductionGens,
     const ArrayRef<llvm::Value *> privateReductionVariables,
-    SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos) {
+    SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos,
+    ArrayRef<bool> isByRef) {
   unsigned numReductions = loop.getNumReductionVars();
 
   for (unsigned i = 0; i < numReductions; ++i) {
@@ -1329,12 +1330,27 @@ static void collectReductionInfo(
       atomicGen = owningAtomicReductionGens[i];
     llvm::Value *variable =
         moduleTranslation.lookupValue(loop.getReductionVars()[i]);
+    mlir::Type allocatedType;
+    reductionDecls[i].getAllocRegion().walk([&](mlir::Operation *op) {
+      if (auto alloca = mlir::dyn_cast<LLVM::AllocaOp>(op)) {
+        allocatedType = alloca.getElemType();
+        return mlir::WalkResult::interrupt();
+      }
+
+      return mlir::WalkResult::advance();
+    });
+
     reductionInfos.push_back(
         {moduleTranslation.convertType(reductionDecls[i].getType()), variable,
          privateReductionVariables[i],
          /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar,
          owningReductionGens[i],
-         /*ReductionGenClang=*/nullptr, atomicGen});
+         /*ReductionGenClang=*/nullptr, atomicGen,
+         allocatedType ? moduleTranslation.convertType(allocatedType) : nullptr,
+         reductionDecls[i].getByrefElementType()
+             ? moduleTranslation.convertType(
+                   *reductionDecls[i].getByrefElementType())
+             : nullptr});
   }
 }
 
@@ -1400,7 +1416,7 @@ static LogicalResult createReductionsAndCleanup(
   // ReductionInfo only accepts references to the generators.
   collectReductionInfo(op, builder, moduleTranslation, reductionDecls,
                        owningReductionGens, owningAtomicReductionGens,
-                       privateReductionVariables, reductionInfos);
+                       privateReductionVariables, reductionInfos, isByRef);
 
   // The call to createReductions below expects the block to have a
   // terminator. Create an unreachable instruction to serve as terminator
@@ -2732,7 +2748,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
       SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
       collectReductionInfo(opInst, builder, moduleTranslation, reductionDecls,
                            owningReductionGens, owningAtomicReductionGens,
-                           privateReductionVariables, reductionInfos);
+                           privateReductionVariables, reductionInfos, isByRef);
 
       // Move to region cont block
       builder.SetInsertPoint((*regionBlock)->getTerminator());
diff --git a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir
new file mode 100644
index 0000000000000..af3f5e68b6ddb
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir
@@ -0,0 +1,92 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+  omp.private {type = private} @_QFfooEi_private_i32 : i32
+  omp.declare_reduction @add_reduction_byref_box_heap_f32 : !llvm.ptr attributes {byref_element_type = f32} alloc {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    omp.yield(%2 : !llvm.ptr)
+  } init {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    omp.yield(%arg1 : !llvm.ptr)
+  } combiner {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
+    %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+    %3 = llvm.mlir.constant(1 : i32) : i32
+    %4 = llvm.alloca %3 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
+    %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+    %6 = llvm.mlir.constant(24 : i32) : i32
+    "llvm.intr.memcpy"(%5, %arg0, %6) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    %7 = llvm.mlir.constant(24 : i32) : i32
+    "llvm.intr.memcpy"(%2, %arg1, %7) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+    %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr
+    %10 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr
+    %12 = llvm.load %9 : !llvm.ptr -> f32
+    %13 = llvm.load %11 : !llvm.ptr -> f32
+    %14 = llvm.fadd %12, %13 {fastmathFlags = #llvm.fastmath<contract>} : f32
+    llvm.store %14, %9 : f32, !llvm.ptr
+    omp.yield(%arg0 : !llvm.ptr)
+  }
+  llvm.func @foo_() {
+    %0 = llvm.mlir.constant(1 : i64) : i64
+    %4 = llvm.alloca %0 x i1 : (i64) -> !llvm.ptr<5>
+    %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+    %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+    %9 = omp.map.info var_ptr(%5 : !llvm.ptr, f32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%8 : !llvm.ptr) -> !llvm.ptr {name = ""}
+    %10 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(always, implicit, descriptor, to) capture(ByRef) members(%9 : [0] : !llvm.ptr) -> !llvm.ptr {name = "scalar_alloc"}
+    omp.target map_entries(%10 -> %arg0 : !llvm.ptr) {
+      %13 = llvm.mlir.constant(1000 : i32) : i32
+      %14 = llvm.mlir.constant(1 : i32) : i32
+      omp.parallel {
+        omp.wsloop reduction(byref @add_reduction_byref_box_heap_f32 %arg0 -> %arg4 : !llvm.ptr) {
+          omp.loop_nest (%arg5) : i32 = (%14) to (%13) inclusive step (%14) {
+            omp.yield
+          }
+        }
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK: define {{.*}} @_omp_reduction_shuffle_and_reduce_func({{.*}}) {{.*}} {
+// CHECK:   %[[REMOTE_RED_LIST:.omp.reduction.remote_reduce_list]] = alloca [1 x ptr], align 8, addrspace(5)
+// CHECK:   %[[RED_ELEM:.omp.reduction.element]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8, addrspace(5)
+// CHECK:   %[[RED_ELEM_1:.*]] = addrspacecast ptr addrspace(5) %[[RED_ELEM]] to ptr
+
+// CHECK:   %[[SHUFFLE_ELEM:.*]] = alloca float, align 4, addrspace(5)
+// CHECK:   %[[REMOTE_RED_LIST_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[REMOTE_RED_LIST]] to ptr
+
+// CHECK:   %[[REMOTE_RED_LIST_ELEM0:.*]] = getelementptr inbounds [1 x ptr], ptr %[[REMOTE_RED_LIST_ASCAST]], i64 0, i64 0
+
+// CHECK:   %[[SHUFFLE_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[SHUFFLE_ELEM]] to ptr
+// CHECK:   %[[SHUFFLE_RES:.*]] = call i32 @__kmpc_shuffle_int32({{.*}})
+// CHECK:   store i32 %[[SHUFFLE_RES]], ptr %[[SHUFFLE_ELEM_ASCAST]], align 4
+
+// CHECK:   %[[RED_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[RED_ELEM]] to ptr
+// CHECK:   %[[RED_ALLOC_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[RED_ELEM_ASCAST]], i32 0, i32 0
+// CHECK:   %[[SHUFFLE_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[SHUFFLE_ELEM]] to ptr
+// CHECK:   store ptr %[[SHUFFLE_ELEM_ASCAST]], ptr %[[RED_ALLOC_PTR]], align 8
+// CHECK:   store ptr %[[RED_ELEM_1]], ptr %[[REMOTE_RED_LIST_ELEM0]], align 8
+// CHECK: }
+
+// CHECK: define {{.*}} @_omp_reduction_inter_warp_copy_func({{.*}}) {{.*}} {
+// CHECK:   %[[WARP_MASTER_CMP:.*]] = icmp eq i32 %nvptx_lane_id, 0
+// CHECK:   br i1 %[[WARP_MASTER_CMP]], label %[[WARP_MASTER_BB:.*]], label %{{.*}}
+
+// CHECK: [[WARP_MASTER_BB]]:
+// CHECK:   %[[WARP_RESULT_PTR:.*]] = getelementptr inbounds [1 x ptr], ptr %{{.*}}, i64 0, i64 0
+// CHECK:   %[[WARP_RESULT:.*]] = load ptr, ptr %[[WARP_RESULT_PTR]], align 8
+// CHECK:   %[[ALLOC_MEM_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[WARP_RESULT]], i32 0, i32 0
+// CHECK:   %[[ALLOC_MEM:.*]] = load ptr, ptr %[[ALLOC_MEM_PTR]], align 8
+// CHECK:   %[[WARP_TRANSFER_SLOT:.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 %nvptx_warp_id
+// CHECK:   %[[WARP_RED_RES:.*]] = load i32, ptr %[[ALLOC_MEM]], align 4
+// CHECK:   store volatile i32 %[[WARP_RED_RES]], ptr addrspace(3) %[[WARP_TRANSFER_SLOT]], align 4
+// CHECK: }
diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
index 87ff0ba786648..08a738c8fe4c6 100644
--- a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
@@ -7,7 +7,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 :
   llvm.func @bar() {}
   llvm.func @baz() {}
 
-  omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr alloc {
+  omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr attributes {byref_element_type = !llvm.array<5 x f32>} alloc {
     %0 = llvm.mlir.constant(1 : i64) : i64
     %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr<5>
     %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
@@ -67,9 +67,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 :
 // CHECK:     br label %[[CONT_BB:.*]]
 
 // CHECK:   [[CONT_BB]]:
-// CHECK-NEXT: %[[RED_RHS:.*]] = phi ptr [ %final.rhs, %{{.*}} ]
-// CHECK-NEXT: store ptr %[[RED_RHS]], ptr %{{.*}}, align 8
-// CHECK-NEXT: br label %.omp.reduction.done
+// CHECK-NEXT: %[[RED_RHS:.*]] = phi ptr [ %{{.*}}, %{{.*}} ]
 // CHECK: }
 
 // CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %0, ptr noundef %1) #0 {
diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
index b8b7c780a74d0..8950db3fc48aa 100644
--- a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
@@ -109,19 +109,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: icmp eq i32 %[[MASTER]], 1
 // CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]]
 // CHECK: [[THEN]]:
-// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double
 // CHECK-NEXT: %[[FINAL_LHS0:[A-Za-z0-9_.]*]] = load double
+// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double
 // CHECK-NEXT: %[[FINAL_RESULT0:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS0]], %[[FINAL_RHS0]]
 // CHECK-NEXT: store double %[[FINAL_RESULT0]]
-// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double
 // CHECK-NEXT: %[[FINAL_LHS1:[A-Za-z0-9_.]*]] = load double
+// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double
 // CHECK-NEXT: %[[FINAL_RESULT1:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS1]], %[[FINAL_RHS1]]
 // CHECK-NEXT: store double %[[FINAL_RESULT1]]
-// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float
 // CHECK-NEXT: %[[FINAL_LHS2:[A-Za-z0-9_.]*]] = load float
+// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float
 // CHECK-NEXT: %[[FINAL_RESULT2:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS2]], %[[FINAL_RHS2]]
 // CHECK-NEXT: store float %[[FINAL_RESULT2]]
-// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float
 // CHECK-NEXT: %[[FINAL_LHS3:[A-Za-z0-9_.]*]] = load float
+// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float
 // CHECK-NEXT: %[[FINAL_RESULT3:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS3]], %[[FINAL_RHS3]]
 // CHECK-NEXT: store float %[[FINAL_RESULT3]]
diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
index 9aba72dabf13c..b7cb1026967f3 100644
--- a/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
@@ -59,8 +59,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: call void @__kmpc_barrier
 
 // CHECK: [[THEN]]:
-// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
 // CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32
+// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
 // CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]]
 // CHECK-NEXT: store i32 %[[FINAL_RESULT]]
 
diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir
index dc22fe11666cf..36eb280dfcfa2 100644
--- a/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir
@@ -62,8 +62,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // CHECK: icmp eq i32 %[[MASTER]], 1
 // CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]]
 // CHECK: [[THEN]]:
-// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
 // CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32
+// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
 // CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]]
 // CHECK-NEXT: store i32 %[[FINAL_RESULT]]
 



More information about the cfe-commits mailing list