[clang] [flang] [llvm] [mlir] [OpenMP][flang] Add initial support for by-ref reductions on the GPU (PR #165714)
Kareem Ergawy via cfe-commits
cfe-commits at lists.llvm.org
Thu Oct 30 07:55:55 PDT 2025
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/165714
>From 23caa561392cf774f098f3d6396719bdc1a0af20 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Fri, 17 Oct 2025 08:35:07 -0500
Subject: [PATCH] [OpenMP][flang] Add initial support for by-ref reductions on
the GPU
Adds initial support for GPU by-ref reductions. In particular, this diff
adds support for reductions on scalar allocatables where reductions
happen on loops nested in `target` regions. For example:
```fortran
integer :: i
real, allocatable :: scalar_alloc
allocate(scalar_alloc)
scalar_alloc = 0
!$omp target map(tofrom: scalar_alloc)
!$omp parallel do reduction(+: scalar_alloc)
do i = 1, 1000000
scalar_alloc = scalar_alloc + 1
end do
!$omp end target
```
This PR supports by-ref reductions on the intra- and inter-warp levels.
There are still steps to be taken for full support of by-ref
reductions, for example:
* Inter-block value combination is not yet supported.
Therefore, `target teams distribute parallel do` is still not
supported.
* Support for dynamically-sized arrays still needs to be added.
* Support for more than one allocatable/array on the same `reduction`
clause still needs to be added.
---
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 4 +-
.../include/flang/Optimizer/Dialect/FIROps.td | 3 +-
.../lib/Lower/Support/ReductionProcessor.cpp | 7 +-
.../OpenMP/DoConcurrentConversion.cpp | 3 +-
.../delayed-privatization-reduction-byref.f90 | 2 +-
.../parallel-reduction-allocatable-array.f90 | 2 +-
.../OpenMP/parallel-reduction-array-lb.f90 | 2 +-
.../Lower/OpenMP/parallel-reduction-array.f90 | 2 +-
.../OpenMP/parallel-reduction-array2.f90 | 2 +-
.../parallel-reduction-pointer-array.f90 | 2 +-
.../test/Lower/OpenMP/parallel-reduction3.f90 | 2 +-
.../OpenMP/reduction-array-intrinsic.f90 | 2 +-
.../Lower/OpenMP/sections-array-reduction.f90 | 2 +-
.../OpenMP/taskgroup-task-array-reduction.f90 | 2 +-
...oop-reduction-allocatable-array-minmax.f90 | 4 +-
.../OpenMP/wsloop-reduction-allocatable.f90 | 2 +-
.../wsloop-reduction-array-assumed-shape.f90 | 2 +-
.../OpenMP/wsloop-reduction-array-lb.f90 | 2 +-
.../OpenMP/wsloop-reduction-array-lb2.f90 | 2 +-
.../Lower/OpenMP/wsloop-reduction-array.f90 | 2 +-
.../Lower/OpenMP/wsloop-reduction-array2.f90 | 2 +-
.../wsloop-reduction-multiple-clauses.f90 | 2 +-
.../Lower/OpenMP/wsloop-reduction-pointer.f90 | 2 +-
.../do_concurrent_reduce_allocatable.f90 | 2 +-
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 24 ++-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 156 ++++++++++++++----
mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 4 +-
.../Conversion/SCFToOpenMP/SCFToOpenMP.cpp | 3 +-
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 24 ++-
.../LLVMIR/allocatable_gpu_reduction.mlir | 92 +++++++++++
.../omptarget-multi-block-reduction.mlir | 6 +-
.../LLVMIR/omptarget-multi-reduction.mlir | 8 +-
.../omptarget-teams-distribute-reduction.mlir | 2 +-
.../LLVMIR/omptarget-teams-reduction.mlir | 2 +-
34 files changed, 296 insertions(+), 84 deletions(-)
create mode 100644 mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index fddeba98adccc..9a8c75073aa4c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1784,8 +1784,8 @@ void CGOpenMPRuntimeGPU::emitReduction(
llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
cantFail(OMPBuilder.createReductionsGPU(
- OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
- llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
+ OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, {}, false,
+ TeamsReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
CGF.getTarget().getGridValue(),
C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc));
CGF.Builder.restoreIP(AfterIP);
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 58a317cf5d691..ff4dab1136ee9 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3743,7 +3743,8 @@ def fir_DeclareReductionOp : fir_Op<"declare_reduction", [IsolatedFromAbove,
}];
let arguments = (ins SymbolNameAttr:$sym_name,
- TypeAttr:$type);
+ TypeAttr:$type,
+ OptionalAttr<TypeAttr>:$byref_element_type);
let regions = (region MaxSizedRegion<1>:$allocRegion,
AnyRegion:$initializerRegion,
diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp
index 605a5b6b20b94..e02cd8fac823b 100644
--- a/flang/lib/Lower/Support/ReductionProcessor.cpp
+++ b/flang/lib/Lower/Support/ReductionProcessor.cpp
@@ -573,10 +573,15 @@ OpType ReductionProcessor::createDeclareReduction(
mlir::OpBuilder modBuilder(module.getBodyRegion());
mlir::Type valTy = fir::unwrapRefType(type);
+ mlir::TypeAttr boxedTy{};
+
if (!isByRef)
type = valTy;
- decl = OpType::create(modBuilder, loc, reductionOpName, type);
+ if (isByRef)
+ boxedTy = mlir::TypeAttr::get(fir::unwrapPassByRefType(valTy));
+
+ decl = OpType::create(modBuilder, loc, reductionOpName, type, boxedTy);
createReductionAllocAndInitRegions(converter, loc, decl, redId, type,
isByRef);
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 1229018bd9b3e..11609ea7b6040 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -851,7 +851,8 @@ class DoConcurrentConversion
if (!ompReducer) {
ompReducer = mlir::omp::DeclareReductionOp::create(
rewriter, firReducer.getLoc(), ompReducerName,
- firReducer.getTypeAttr().getValue());
+ firReducer.getTypeAttr().getValue(),
+ firReducer.getByrefElementTypeAttr());
cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(),
ompReducer.getAllocRegion());
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
index 4b6a643f94059..4c7b6ac5f5f9b 100644
--- a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
+++ b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
@@ -22,7 +22,7 @@ subroutine red_and_delayed_private
! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : i32
! CHECK-LABEL: omp.declare_reduction
-! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> alloc
+! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> attributes {byref_element_type = i32} alloc
! CHECK-LABEL: _QPred_and_delayed_private
! CHECK: omp.parallel
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90
index 41c7d69ebb3ba..f56875dcb518b 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90
@@ -18,7 +18,7 @@ program reduce
end program
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> attributes {byref_element_type = !fir.array<?xi32>} alloc {
! CHECK: %[[VAL_10:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
! CHECK: omp.yield(%[[VAL_10]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90
index aa91e1e0e8b15..d9ba3bed464f8 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90
@@ -12,7 +12,7 @@ program reduce
end program
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref<!fir.box<!fir.array<3x2xi32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x2xi32 : !fir.ref<!fir.box<!fir.array<3x2xi32>>> {{.*}} alloc {
! CHECK: %[[VAL_15:.*]] = fir.alloca !fir.box<!fir.array<3x2xi32>>
! CHECK: omp.yield(%[[VAL_15]] : !fir.ref<!fir.box<!fir.array<3x2xi32>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
index 59595de338d50..636660f279e85 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-array.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array.f90
@@ -17,7 +17,7 @@ program reduce
print *,i
end program
-! CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
+! CPU-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> attributes {byref_element_type = !fir.array<3xi32>} alloc {
! CPU: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
! CPU: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
! CPU-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
index 14338c6f50817..9cf8a63427ed1 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90
@@ -13,7 +13,7 @@ program reduce
print *,i
end program
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3xi32 : !fir.ref<!fir.box<!fir.array<3xi32>>> {{.*}} alloc {
! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<3xi32>>
! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<3xi32>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
index 36344458d1cae..3de2ba8f61f8e 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90
@@ -19,7 +19,7 @@ program reduce
end program
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_Uxi32 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> attributes {byref_element_type = !fir.array<?xi32>} alloc {
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xi32>>>
! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction3.f90 b/flang/test/Lower/OpenMP/parallel-reduction3.f90
index 9af18378f0ae0..da337378862be 100644
--- a/flang/test/Lower/OpenMP/parallel-reduction3.f90
+++ b/flang/test/Lower/OpenMP/parallel-reduction3.f90
@@ -1,7 +1,7 @@
! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc {
! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xi32>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90 b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
index 8b94d51f986f5..4a0593ff9eca4 100644
--- a/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
+++ b/flang/test/Lower/OpenMP/reduction-array-intrinsic.f90
@@ -9,7 +9,7 @@ subroutine max_array_reduction(l, r)
!$omp end parallel
end subroutine
-! CHECK-LABEL: omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @max_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc {
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<?xi32>>
! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.array<?xi32>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/sections-array-reduction.f90 b/flang/test/Lower/OpenMP/sections-array-reduction.f90
index 2f2808cebfc0c..0dbe9e3673395 100644
--- a/flang/test/Lower/OpenMP/sections-array-reduction.f90
+++ b/flang/test/Lower/OpenMP/sections-array-reduction.f90
@@ -14,7 +14,7 @@ subroutine sectionsReduction(x)
end subroutine
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> {{.*}} alloc {
! [...]
! CHECK: omp.yield
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90 b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
index 18a4f75b86309..3a63bb09c59de 100644
--- a/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
+++ b/flang/test/Lower/OpenMP/taskgroup-task-array-reduction.f90
@@ -1,7 +1,7 @@
! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf32 : !fir.ref<!fir.box<!fir.array<?xf32>>> {{.*}} alloc {
! [...]
! CHECK: omp.yield
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90
index 2cd953de0dffa..ed81577ecce16 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90
@@ -32,7 +32,7 @@ program reduce15
print *,"min: ", mins
end program
-! CHECK-LABEL: omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @min_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}} alloc {
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
! CHECK-LABEL: } init {
@@ -93,7 +93,7 @@ program reduce15
! CHECK: omp.yield
! CHECK: }
-! CHECK-LABEL: omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @max_byref_box_heap_Uxi32 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>> {{.*}} alloc {
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90
index 663851cba46c6..d8c0a36db126e 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90
@@ -18,7 +18,7 @@ program reduce
end program
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_heap_i32 : !fir.ref<!fir.box<!fir.heap<i32>>> attributes {byref_element_type = i32} alloc {
! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box<!fir.heap<i32>>
! CHECK: omp.yield(%[[VAL_2]] : !fir.ref<!fir.box<!fir.heap<i32>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
index 209ee9a4e0cef..28acb8f19531f 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-assumed-shape.f90
@@ -22,7 +22,7 @@ subroutine reduce(r)
end subroutine
end program
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref<!fir.box<!fir.array<?xf64>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxf64 : !fir.ref<!fir.box<!fir.array<?xf64>>> {{.*}} alloc {
! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<?xf64>>
! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<?xf64>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90
index 2233a74600948..ec448cf20f111 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90
@@ -11,7 +11,7 @@ program reduce
!$omp end parallel do
end program
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> {{.*}} alloc {
! CHECK: } combiner {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.array<2xi32>>>):
! CHECK: %[[ARR0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.array<2xi32>>>
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90
index 211bde19da8db..9da05a290ec21 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90
@@ -19,7 +19,7 @@ subroutine sub(a, lb, ub)
end program
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_Uxi32 : !fir.ref<!fir.box<!fir.array<?xi32>>> {{.*}} alloc {
! CHECK: } combiner {
! CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.box<!fir.array<?xi32>>>, %[[ARG1:.*]]: !fir.ref<!fir.box<!fir.array<?xi32>>>):
! CHECK: %[[ARR0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.box<!fir.array<?xi32>>>
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
index afaeba27c5eae..14b657c8e180d 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90
@@ -14,7 +14,7 @@ program reduce
print *,r
end program
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> attributes {byref_element_type = !fir.array<2xi32>} alloc {
! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
index 25b2e97a1b7f7..d0a0c38e4ccb1 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90
@@ -14,7 +14,7 @@ program reduce
print *,r
end program
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_2xi32 : !fir.ref<!fir.box<!fir.array<2xi32>>> {{.*}} alloc {
! CHECK: %[[VAL_8:.*]] = fir.alloca !fir.box<!fir.array<2xi32>>
! CHECK: omp.yield(%[[VAL_8]] : !fir.ref<!fir.box<!fir.array<2xi32>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90
index edd2bcb1d6be8..60a162d8f8002 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90
@@ -24,7 +24,7 @@ program main
endprogram
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref<!fir.box<!fir.array<3x3xf64>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_3x3xf64 : !fir.ref<!fir.box<!fir.array<3x3xf64>>> {{.*}} alloc {
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.array<3x3xf64>>
! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.array<3x3xf64>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90
index 27b726376fbeb..f640f5caddf76 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90
@@ -18,7 +18,7 @@ program reduce_pointer
deallocate(v)
end program
-! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> alloc {
+! CHECK-LABEL: omp.declare_reduction @add_reduction_byref_box_ptr_i32 : !fir.ref<!fir.box<!fir.ptr<i32>>> {{.*}} alloc {
! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.box<!fir.ptr<i32>>
! CHECK: omp.yield(%[[VAL_3]] : !fir.ref<!fir.box<!fir.ptr<i32>>>)
! CHECK-LABEL: } init {
diff --git a/flang/test/Lower/do_concurrent_reduce_allocatable.f90 b/flang/test/Lower/do_concurrent_reduce_allocatable.f90
index 873fd10dd1b97..4fb67c094b594 100644
--- a/flang/test/Lower/do_concurrent_reduce_allocatable.f90
+++ b/flang/test/Lower/do_concurrent_reduce_allocatable.f90
@@ -8,7 +8,7 @@ subroutine do_concurrent_allocatable
end do
end subroutine
-! CHECK: fir.declare_reduction @[[RED_OP:.*]] : ![[RED_TYPE:.*]] alloc {
+! CHECK: fir.declare_reduction @[[RED_OP:.*]] : ![[RED_TYPE:.*]] attributes {byref_element_type = !fir.array<?x?xf32>} alloc {
! CHECK: %[[ALLOC:.*]] = fir.alloca
! CHECK: fir.yield(%[[ALLOC]] : ![[RED_TYPE]])
! CHECK: } init {
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5331cb5abdc6f..f4192f9b49fd9 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1448,11 +1448,15 @@ class OpenMPIRBuilder {
ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
EvalKind EvaluationKind, ReductionGenCBTy ReductionGen,
ReductionGenClangCBTy ReductionGenClang,
- ReductionGenAtomicCBTy AtomicReductionGen)
+ ReductionGenAtomicCBTy AtomicReductionGen,
+ Type *ByRefAllocatedType = nullptr,
+ Type *ByRefElementType = nullptr)
: ElementType(ElementType), Variable(Variable),
PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
- AtomicReductionGen(AtomicReductionGen) {}
+ AtomicReductionGen(AtomicReductionGen),
+ ByRefAllocatedType(ByRefAllocatedType),
+ ByRefElementType(ByRefElementType) {}
ReductionInfo(Value *PrivateVariable)
: ElementType(nullptr), Variable(nullptr),
PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar),
@@ -1485,6 +1489,9 @@ class OpenMPIRBuilder {
/// reduction. If null, the implementation will use the non-atomic version
/// along with the appropriate synchronization mechanisms.
ReductionGenAtomicCBTy AtomicReductionGen;
+
+ Type *ByRefAllocatedType;
+ Type *ByRefElementType;
};
enum class CopyAction : unsigned {
@@ -1529,14 +1536,15 @@ class OpenMPIRBuilder {
/// Function to shuffle over the value from the remote lane.
void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
- Type *ElementType, Value *Offset,
- Type *ReductionArrayTy);
+ Type *ElementType, Value *Offset, Type *ReductionArrayTy,
+ bool IsByRefElem);
/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
void emitReductionListCopy(
InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
+ ArrayRef<bool> IsByRef,
CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
/// Emit a helper that reduces data across two OpenMP threads (lanes)
@@ -1614,7 +1622,7 @@ class OpenMPIRBuilder {
/// \return The ShuffleAndReduce function.
Function *emitShuffleAndReduceFunction(
ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
- Function *ReduceFn, AttributeList FuncAttrs);
+ Function *ReduceFn, AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
/// Helper function for CreateCanonicalScanLoops to create InputLoop
/// in the firstGen and Scan Loop in the SecondGen
@@ -1679,7 +1687,7 @@ class OpenMPIRBuilder {
Expected<Function *>
emitInterWarpCopyFunction(const LocationDescription &Loc,
ArrayRef<ReductionInfo> ReductionInfos,
- AttributeList FuncAttrs);
+ AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
/// This function emits a helper that copies all the reduction variables from
/// the team into the provided global buffer for the reduction variables.
@@ -1773,6 +1781,7 @@ class OpenMPIRBuilder {
/// \return The reduction function.
Expected<Function *> createReductionFunction(
StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
+ ArrayRef<bool> IsByRef,
ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
AttributeList FuncAttrs = {});
@@ -2039,7 +2048,8 @@ class OpenMPIRBuilder {
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(
const LocationDescription &Loc, InsertPointTy AllocaIP,
InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
- bool IsNoWait = false, bool IsTeamsReduction = false,
+ ArrayRef<bool> IsByRef, bool IsNoWait = false,
+ bool IsTeamsReduction = false,
ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
Value *SrcLocInfo = nullptr);
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 0e5926ff0fb18..73089010c308b 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2450,7 +2450,8 @@ Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
Value *DstAddr, Type *ElemType,
- Value *Offset, Type *ReductionArrayTy) {
+ Value *Offset, Type *ReductionArrayTy,
+ bool IsByRefElem) {
uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
// Create the loop over the big sized data.
// ptr = (void*)Elem;
@@ -2535,7 +2536,7 @@ void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
void OpenMPIRBuilder::emitReductionListCopy(
InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
- CopyOptionsTy CopyOptions) {
+ ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
Type *IndexTy = Builder.getIndexTy(
M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
@@ -2545,6 +2546,7 @@ void OpenMPIRBuilder::emitReductionListCopy(
for (auto En : enumerate(ReductionInfos)) {
const ReductionInfo &RI = En.value();
Value *SrcElementAddr = nullptr;
+ AllocaInst *DestAlloca = nullptr;
Value *DestElementAddr = nullptr;
Value *DestElementPtrAddr = nullptr;
// Should we shuffle in an element from a remote lane?
@@ -2564,14 +2566,17 @@ void OpenMPIRBuilder::emitReductionListCopy(
DestElementPtrAddr = Builder.CreateInBoundsGEP(
ReductionArrayTy, DestBase,
{ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+ bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
switch (Action) {
case CopyAction::RemoteLaneToThread: {
InsertPointTy CurIP = Builder.saveIP();
Builder.restoreIP(AllocaIP);
- AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
- ".omp.reduction.element");
+ Type *DestAllocaType =
+ IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
+ DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
+ ".omp.reduction.element");
DestAlloca->setAlignment(
- M.getDataLayout().getPrefTypeAlign(RI.ElementType));
+ M.getDataLayout().getPrefTypeAlign(DestAllocaType));
DestElementAddr = DestAlloca;
DestElementAddr =
Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
@@ -2591,8 +2596,45 @@ void OpenMPIRBuilder::emitReductionListCopy(
// Now that all active lanes have read the element in the
// Reduce list, shuffle over the value from the remote lane.
if (ShuffleInElement) {
- shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
- RemoteLaneOffset, ReductionArrayTy);
+ Type *ShuffleType = RI.ElementType;
+ Value *ShuffleSrcAddr = SrcElementAddr;
+ Value *ShuffleDestAddr = DestElementAddr;
+ Value *Zero = ConstantInt::get(Builder.getInt32Ty(), 0);
+ AllocaInst *LocalStorage = nullptr;
+
+ if (IsByRefElem) {
+ assert(RI.ByRefElementType && "Expected by-ref element type to be set");
+ assert(RI.ByRefAllocatedType &&
+ "Expected by-ref allocated type to be set");
+ ShuffleType = RI.ByRefElementType;
+
+ ShuffleSrcAddr = Builder.CreateGEP(RI.ByRefAllocatedType,
+ ShuffleSrcAddr, {Zero, Zero});
+ ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
+
+ {
+ auto OldIP = Builder.saveIP();
+ Builder.restoreIP(AllocaIP);
+
+ LocalStorage = Builder.CreateAlloca(ShuffleType);
+ Builder.restoreIP(OldIP);
+ ShuffleDestAddr = LocalStorage;
+ }
+ }
+
+ shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
+ RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
+
+ if (IsByRefElem) {
+ auto *GEP =
+ Builder.CreateGEP(RI.ByRefAllocatedType,
+ Builder.CreatePointerBitCastOrAddrSpaceCast(
+ DestAlloca, Builder.getPtrTy(), ".ascast"),
+ {Zero, Zero});
+ Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LocalStorage, Builder.getPtrTy(), ".ascast"),
+ GEP);
+ }
} else {
switch (RI.EvaluationKind) {
case EvalKind::Scalar: {
@@ -2647,7 +2689,7 @@ void OpenMPIRBuilder::emitReductionListCopy(
Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
- AttributeList FuncAttrs) {
+ AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
InsertPointTy SavedIP = Builder.saveIP();
LLVMContext &Ctx = M.getContext();
FunctionType *FuncTy = FunctionType::get(
@@ -2728,7 +2770,9 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
// memory.
//
const ReductionInfo &RI = En.value();
- unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
+ bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
+ unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
+ IsByRefElem ? RI.ByRefElementType : RI.ElementType);
for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
Type *CType = Builder.getIntNTy(TySize * 8);
@@ -2791,6 +2835,15 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
ConstantInt::get(IndexTy, En.index())});
// elemptr = ((CopyType*)(elemptrptr)) + I
Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+
+ if (IsByRefElem) {
+ Type *Int32Ty = Builder.getInt32Ty();
+ Constant *Zero = ConstantInt::get(Int32Ty, 0);
+ ElemPtr =
+ Builder.CreateGEP(RI.ByRefAllocatedType, ElemPtr, {Zero, Zero});
+ ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
+ }
+
if (NumIters > 1)
ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
@@ -2846,6 +2899,15 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
Value *TargetElemPtrVal =
Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
Value *TargetElemPtr = TargetElemPtrVal;
+
+ if (IsByRefElem) {
+ Type *Int32Ty = Builder.getInt32Ty();
+ Constant *Zero = ConstantInt::get(Int32Ty, 0);
+ TargetElemPtr = Builder.CreateGEP(RI.ByRefAllocatedType, TargetElemPtr,
+ {Zero, Zero});
+ TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
+ }
+
if (NumIters > 1)
TargetElemPtr =
Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
@@ -2882,7 +2944,7 @@ Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
- AttributeList FuncAttrs) {
+ AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
LLVMContext &Ctx = M.getContext();
FunctionType *FuncTy =
FunctionType::get(Builder.getVoidTy(),
@@ -2961,9 +3023,10 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
// This loop iterates through the list of reduce elements and copies,
// element by element, from a remote lane in the warp to RemoteReduceList,
// hosted on the thread's stack.
- emitReductionListCopy(
- AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
- ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
+ emitReductionListCopy(AllocaIP, CopyAction::RemoteLaneToThread,
+ RedListArrayTy, ReductionInfos, ReduceList,
+ RemoteListAddrCast, IsByRef,
+ {RemoteLaneOffset, nullptr, nullptr});
// The actions to be performed on the Remote Reduce list is dependent
// on the algorithm version.
@@ -3032,7 +3095,8 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
- ReductionInfos, RemoteListAddrCast, ReduceList);
+ ReductionInfos, RemoteListAddrCast, ReduceList,
+ IsByRef);
Builder.CreateBr(CpyMergeBB);
emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
@@ -3437,7 +3501,8 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
Expected<Function *> OpenMPIRBuilder::createReductionFunction(
StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
- ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
+ ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind,
+ AttributeList FuncAttrs) {
auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
{Builder.getPtrTy(), Builder.getPtrTy()},
/* IsVarArg */ false);
@@ -3498,8 +3563,14 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction(
LHSPtrs.emplace_back(LHSPtr);
RHSPtrs.emplace_back(RHSPtr);
} else {
- Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
- Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+ Value *LHS = LHSPtr;
+ Value *RHS = RHSPtr;
+
+ if (!IsByRef.empty() && !IsByRef[En.index()]) {
+ LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+ RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+ }
+
Value *Reduced;
InsertPointOrErrorTy AfterIP =
RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
@@ -3509,7 +3580,9 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction(
return ReductionFunc;
Builder.restoreIP(*AfterIP);
- Builder.CreateStore(Reduced, LHSPtr);
+
+ if (!IsByRef.empty() && !IsByRef[En.index()])
+ Builder.CreateStore(Reduced, LHSPtr);
}
}
@@ -3562,9 +3635,9 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
const LocationDescription &Loc, InsertPointTy AllocaIP,
InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
- bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
- std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
- Value *SrcLocInfo) {
+ ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
+ ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
+ unsigned ReductionBufNum, Value *SrcLocInfo) {
if (!updateToLocation(Loc))
return InsertPointTy();
Builder.restoreIP(CodeGenIP);
@@ -3600,9 +3673,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
CodeGenIP = Builder.saveIP();
- Expected<Function *> ReductionResult =
- createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
- ReductionInfos, ReductionGenCBKind, FuncAttrs);
+ Expected<Function *> ReductionResult = createReductionFunction(
+ Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
+ ReductionGenCBKind, FuncAttrs);
if (!ReductionResult)
return ReductionResult.takeError();
Function *ReductionFunc = *ReductionResult;
@@ -3641,15 +3714,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
Value *ElemPtr = Builder.CreateInBoundsGEP(
RedArrayTy, ReductionList,
{ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+
+ auto *PrviateVar = RI.PrivateVariable;
+ bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
+ if (IsByRefElem)
+ PrviateVar = Builder.CreateLoad(RI.ElementType, PrviateVar);
+
Value *CastElem =
- Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
+ Builder.CreatePointerBitCastOrAddrSpaceCast(PrviateVar, PtrTy);
Builder.CreateStore(CastElem, ElemPtr);
}
CodeGenIP = Builder.saveIP();
- Function *SarFunc =
- emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
+ Function *SarFunc = emitShuffleAndReduceFunction(
+ ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
Expected<Function *> CopyResult =
- emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
+ emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
if (!CopyResult)
return CopyResult.takeError();
Function *WcFunc = *CopyResult;
@@ -3728,7 +3807,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
// Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
for (auto En : enumerate(ReductionInfos)) {
const ReductionInfo &RI = En.value();
- Value *LHS = RI.Variable;
+ Type *ValueType = RI.ElementType;
+ Value *RedValue = RI.Variable;
Value *RHS =
Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
@@ -3739,7 +3819,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
// Fix the CallBack code genereated to use the correct Values for the LHS
// and RHS
- LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
+ LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
return cast<Instruction>(U.getUser())->getParent()->getParent() ==
ReductionFunc;
});
@@ -3748,15 +3828,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
ReductionFunc;
});
} else {
- Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
- Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
+ if (IsByRef.empty() || !IsByRef[En.index()]) {
+ RedValue = Builder.CreateLoad(ValueType, RI.Variable,
+ "red.value." + Twine(En.index()));
+ }
+ Value *PrivateRedValue = Builder.CreateLoad(
+ ValueType, RHS, "red.private.value" + Twine(En.index()));
Value *Reduced;
InsertPointOrErrorTy AfterIP =
- RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
+ RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
if (!AfterIP)
return AfterIP.takeError();
Builder.restoreIP(*AfterIP);
- Builder.CreateStore(Reduced, LHS, false);
+
+ if (!IsByRef.empty() && !IsByRef[En.index()])
+ Builder.CreateStore(Reduced, RI.Variable);
}
}
emitBlock(ExitBB, CurFunc);
@@ -3857,7 +3943,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
assert(ReductionInfos.size() == IsByRef.size());
if (Config.isGPU())
return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
- IsNoWait, IsTeamsReduction);
+ IsByRef, IsNoWait, IsTeamsReduction);
checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 377f1febf6b8f..386174a36d52c 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -2011,7 +2011,9 @@ def DeclareReductionOp : OpenMP_Op<"declare_reduction", [IsolatedFromAbove,
}];
let arguments = (ins SymbolNameAttr:$sym_name,
- TypeAttr:$type);
+ TypeAttr:$type,
+ OptionalAttr<TypeAttr>:$byref_element_type
+ );
let regions = (region MaxSizedRegion<1>:$allocRegion,
AnyRegion:$initializerRegion,
diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
index 460595ba9f254..6423d49859c97 100644
--- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
+++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
@@ -188,7 +188,8 @@ createDecl(PatternRewriter &builder, SymbolTable &symbolTable,
OpBuilder::InsertionGuard guard(builder);
Type type = reduce.getOperands()[reductionIndex].getType();
auto decl = omp::DeclareReductionOp::create(builder, reduce.getLoc(),
- "__scf_reduction", type);
+ "__scf_reduction", type,
+ /*byref_element_type=*/{});
symbolTable.insert(decl);
builder.createBlock(&decl.getInitializerRegion(),
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index f28454075f1d3..5a8b915260b3c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1311,7 +1311,8 @@ static void collectReductionInfo(
SmallVectorImpl<OwningReductionGen> &owningReductionGens,
SmallVectorImpl<OwningAtomicReductionGen> &owningAtomicReductionGens,
const ArrayRef<llvm::Value *> privateReductionVariables,
- SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos) {
+ SmallVectorImpl<llvm::OpenMPIRBuilder::ReductionInfo> &reductionInfos,
+ ArrayRef<bool> isByRef) {
unsigned numReductions = loop.getNumReductionVars();
for (unsigned i = 0; i < numReductions; ++i) {
@@ -1329,12 +1330,27 @@ static void collectReductionInfo(
atomicGen = owningAtomicReductionGens[i];
llvm::Value *variable =
moduleTranslation.lookupValue(loop.getReductionVars()[i]);
+ mlir::Type allocatedType;
+ reductionDecls[i].getAllocRegion().walk([&](mlir::Operation *op) {
+ if (auto alloca = mlir::dyn_cast<LLVM::AllocaOp>(op)) {
+ allocatedType = alloca.getElemType();
+ return mlir::WalkResult::interrupt();
+ }
+
+ return mlir::WalkResult::advance();
+ });
+
reductionInfos.push_back(
{moduleTranslation.convertType(reductionDecls[i].getType()), variable,
privateReductionVariables[i],
/*EvaluationKind=*/llvm::OpenMPIRBuilder::EvalKind::Scalar,
owningReductionGens[i],
- /*ReductionGenClang=*/nullptr, atomicGen});
+ /*ReductionGenClang=*/nullptr, atomicGen,
+ allocatedType ? moduleTranslation.convertType(allocatedType) : nullptr,
+ reductionDecls[i].getByrefElementType()
+ ? moduleTranslation.convertType(
+ *reductionDecls[i].getByrefElementType())
+ : nullptr});
}
}
@@ -1400,7 +1416,7 @@ static LogicalResult createReductionsAndCleanup(
// ReductionInfo only accepts references to the generators.
collectReductionInfo(op, builder, moduleTranslation, reductionDecls,
owningReductionGens, owningAtomicReductionGens,
- privateReductionVariables, reductionInfos);
+ privateReductionVariables, reductionInfos, isByRef);
// The call to createReductions below expects the block to have a
// terminator. Create an unreachable instruction to serve as terminator
@@ -2732,7 +2748,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
collectReductionInfo(opInst, builder, moduleTranslation, reductionDecls,
owningReductionGens, owningAtomicReductionGens,
- privateReductionVariables, reductionInfos);
+ privateReductionVariables, reductionInfos, isByRef);
// Move to region cont block
builder.SetInsertPoint((*regionBlock)->getTerminator());
diff --git a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir
new file mode 100644
index 0000000000000..af3f5e68b6ddb
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir
@@ -0,0 +1,92 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+ omp.private {type = private} @_QFfooEi_private_i32 : i32
+ omp.declare_reduction @add_reduction_byref_box_heap_f32 : !llvm.ptr attributes {byref_element_type = f32} alloc {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr<5>
+ %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+ omp.yield(%2 : !llvm.ptr)
+ } init {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ omp.yield(%arg1 : !llvm.ptr)
+ } combiner {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ %0 = llvm.mlir.constant(1 : i32) : i32
+ %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
+ %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+ %3 = llvm.mlir.constant(1 : i32) : i32
+ %4 = llvm.alloca %3 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5>
+ %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+ %6 = llvm.mlir.constant(24 : i32) : i32
+ "llvm.intr.memcpy"(%5, %arg0, %6) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+ %7 = llvm.mlir.constant(24 : i32) : i32
+ "llvm.intr.memcpy"(%2, %arg1, %7) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
+ %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+ %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr
+ %10 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+ %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr
+ %12 = llvm.load %9 : !llvm.ptr -> f32
+ %13 = llvm.load %11 : !llvm.ptr -> f32
+ %14 = llvm.fadd %12, %13 {fastmathFlags = #llvm.fastmath<contract>} : f32
+ llvm.store %14, %9 : f32, !llvm.ptr
+ omp.yield(%arg0 : !llvm.ptr)
+ }
+ llvm.func @foo_() {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %4 = llvm.alloca %0 x i1 : (i64) -> !llvm.ptr<5>
+ %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr
+ %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>
+ %9 = omp.map.info var_ptr(%5 : !llvm.ptr, f32) map_clauses(implicit, tofrom) capture(ByRef) var_ptr_ptr(%8 : !llvm.ptr) -> !llvm.ptr {name = ""}
+ %10 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(always, implicit, descriptor, to) capture(ByRef) members(%9 : [0] : !llvm.ptr) -> !llvm.ptr {name = "scalar_alloc"}
+ omp.target map_entries(%10 -> %arg0 : !llvm.ptr) {
+ %13 = llvm.mlir.constant(1000 : i32) : i32
+ %14 = llvm.mlir.constant(1 : i32) : i32
+ omp.parallel {
+ omp.wsloop reduction(byref @add_reduction_byref_box_heap_f32 %arg0 -> %arg4 : !llvm.ptr) {
+ omp.loop_nest (%arg5) : i32 = (%14) to (%13) inclusive step (%14) {
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+}
+
+// CHECK: define {{.*}} @_omp_reduction_shuffle_and_reduce_func({{.*}}) {{.*}} {
+// CHECK: %[[REMOTE_RED_LIST:.omp.reduction.remote_reduce_list]] = alloca [1 x ptr], align 8, addrspace(5)
+// CHECK: %[[RED_ELEM:.omp.reduction.element]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8, addrspace(5)
+// CHECK: %[[RED_ELEM_1:.*]] = addrspacecast ptr addrspace(5) %[[RED_ELEM]] to ptr
+
+// CHECK: %[[SHUFFLE_ELEM:.*]] = alloca float, align 4, addrspace(5)
+// CHECK: %[[REMOTE_RED_LIST_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[REMOTE_RED_LIST]] to ptr
+
+// CHECK: %[[REMOTE_RED_LIST_ELEM0:.*]] = getelementptr inbounds [1 x ptr], ptr %[[REMOTE_RED_LIST_ASCAST]], i64 0, i64 0
+
+// CHECK: %[[SHUFFLE_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[SHUFFLE_ELEM]] to ptr
+// CHECK: %[[SHUFFLE_RES:.*]] = call i32 @__kmpc_shuffle_int32({{.*}})
+// CHECK: store i32 %[[SHUFFLE_RES]], ptr %[[SHUFFLE_ELEM_ASCAST]], align 4
+
+// CHECK: %[[RED_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[RED_ELEM]] to ptr
+// CHECK: %[[RED_ALLOC_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[RED_ELEM_ASCAST]], i32 0, i32 0
+// CHECK: %[[SHUFFLE_ELEM_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[SHUFFLE_ELEM]] to ptr
+// CHECK: store ptr %[[SHUFFLE_ELEM_ASCAST]], ptr %[[RED_ALLOC_PTR]], align 8
+// CHECK: store ptr %[[RED_ELEM_1]], ptr %[[REMOTE_RED_LIST_ELEM0]], align 8
+// CHECK: }
+
+// CHECK: define {{.*}} @_omp_reduction_inter_warp_copy_func({{.*}}) {{.*}} {
+// CHECK: %[[WARP_MASTER_CMP:.*]] = icmp eq i32 %nvptx_lane_id, 0
+// CHECK: br i1 %[[WARP_MASTER_CMP]], label %[[WARP_MASTER_BB:.*]], label %{{.*}}
+
+// CHECK: [[WARP_MASTER_BB]]:
+// CHECK: %[[WARP_RESULT_PTR:.*]] = getelementptr inbounds [1 x ptr], ptr %{{.*}}, i64 0, i64 0
+// CHECK: %[[WARP_RESULT:.*]] = load ptr, ptr %[[WARP_RESULT_PTR]], align 8
+// CHECK: %[[ALLOC_MEM_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[WARP_RESULT]], i32 0, i32 0
+// CHECK: %[[ALLOC_MEM:.*]] = load ptr, ptr %[[ALLOC_MEM_PTR]], align 8
+// CHECK: %[[WARP_TRANSFER_SLOT:.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 %nvptx_warp_id
+// CHECK: %[[WARP_RED_RES:.*]] = load i32, ptr %[[ALLOC_MEM]], align 4
+// CHECK: store volatile i32 %[[WARP_RED_RES]], ptr addrspace(3) %[[WARP_TRANSFER_SLOT]], align 4
+// CHECK: }
diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
index 87ff0ba786648..08a738c8fe4c6 100644
--- a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
@@ -7,7 +7,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 :
llvm.func @bar() {}
llvm.func @baz() {}
- omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr alloc {
+ omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr attributes {byref_element_type = !llvm.array<5 x f32>} alloc {
%0 = llvm.mlir.constant(1 : i64) : i64
%1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr<5>
%2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
@@ -67,9 +67,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 :
// CHECK: br label %[[CONT_BB:.*]]
// CHECK: [[CONT_BB]]:
-// CHECK-NEXT: %[[RED_RHS:.*]] = phi ptr [ %final.rhs, %{{.*}} ]
-// CHECK-NEXT: store ptr %[[RED_RHS]], ptr %{{.*}}, align 8
-// CHECK-NEXT: br label %.omp.reduction.done
+// CHECK-NEXT: %[[RED_RHS:.*]] = phi ptr [ %{{.*}}, %{{.*}} ]
// CHECK: }
// CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %0, ptr noundef %1) #0 {
diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
index b8b7c780a74d0..8950db3fc48aa 100644
--- a/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir
@@ -109,19 +109,19 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: icmp eq i32 %[[MASTER]], 1
// CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]]
// CHECK: [[THEN]]:
-// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double
// CHECK-NEXT: %[[FINAL_LHS0:[A-Za-z0-9_.]*]] = load double
+// CHECK-NEXT: %[[FINAL_RHS0:[A-Za-z0-9_.]*]] = load double
// CHECK-NEXT: %[[FINAL_RESULT0:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS0]], %[[FINAL_RHS0]]
// CHECK-NEXT: store double %[[FINAL_RESULT0]]
-// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double
// CHECK-NEXT: %[[FINAL_LHS1:[A-Za-z0-9_.]*]] = load double
+// CHECK-NEXT: %[[FINAL_RHS1:[A-Za-z0-9_.]*]] = load double
// CHECK-NEXT: %[[FINAL_RESULT1:[A-Za-z0-9_.]*]] = fadd contract double %[[FINAL_LHS1]], %[[FINAL_RHS1]]
// CHECK-NEXT: store double %[[FINAL_RESULT1]]
-// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float
// CHECK-NEXT: %[[FINAL_LHS2:[A-Za-z0-9_.]*]] = load float
+// CHECK-NEXT: %[[FINAL_RHS2:[A-Za-z0-9_.]*]] = load float
// CHECK-NEXT: %[[FINAL_RESULT2:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS2]], %[[FINAL_RHS2]]
// CHECK-NEXT: store float %[[FINAL_RESULT2]]
-// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float
// CHECK-NEXT: %[[FINAL_LHS3:[A-Za-z0-9_.]*]] = load float
+// CHECK-NEXT: %[[FINAL_RHS3:[A-Za-z0-9_.]*]] = load float
// CHECK-NEXT: %[[FINAL_RESULT3:[A-Za-z0-9_.]*]] = fadd contract float %[[FINAL_LHS3]], %[[FINAL_RHS3]]
// CHECK-NEXT: store float %[[FINAL_RESULT3]]
diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
index 9aba72dabf13c..b7cb1026967f3 100644
--- a/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir
@@ -59,8 +59,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: call void @__kmpc_barrier
// CHECK: [[THEN]]:
-// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
// CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32
+// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
// CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]]
// CHECK-NEXT: store i32 %[[FINAL_RESULT]]
diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir
index dc22fe11666cf..36eb280dfcfa2 100644
--- a/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir
@@ -62,8 +62,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: icmp eq i32 %[[MASTER]], 1
// CHECK: i1 %{{.+}}, label %[[THEN:[A-Za-z0-9_.]*]], label %[[DONE:[A-Za-z0-9_.]*]]
// CHECK: [[THEN]]:
-// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
// CHECK-NEXT: %[[FINAL_LHS:[A-Za-z0-9_.]*]] = load i32
+// CHECK-NEXT: %[[FINAL_RHS:[A-Za-z0-9_.]*]] = load i32
// CHECK-NEXT: %[[FINAL_RESULT:[A-Za-z0-9_.]*]] = add i32 %[[FINAL_LHS]], %[[FINAL_RHS]]
// CHECK-NEXT: store i32 %[[FINAL_RESULT]]
More information about the cfe-commits mailing list