[Mlir-commits] [mlir] [mlir][OpenMP] implement SIMD reduction (PR #146671)
Tom Eccles
llvmlistbot at llvm.org
Wed Jul 2 04:03:01 PDT 2025
https://github.com/tblah created https://github.com/llvm/llvm-project/pull/146671
This replicates clang's implementation. Basically:
- A private copy of the reduction variable is created, initialized to the reduction neutral value (using regions from the reduction declaration op).
- The body of the loop is lowered as usual, with accesses to the reduction variable mapped to the private copy.
- After the loop, we inline the reduction region from the declaration op to combine the privatized variable into the original variable.
- As usual with the SIMD construct, attributes are added to encourage vectorization of the loop and to assert that memory accesses in the loop don't alias across iterations.
I have verified that simple scalar examples do vectorize at -O3 and the tests I could find in the Fujitsu test suite produce correct results. I tested on top of #146097 and this seemed to work for composite constructs as well.
Fixes #144290
From 4b5373a20c084b6c2dea790571e1b5b9633006ec Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Fri, 27 Jun 2025 17:26:17 +0000
Subject: [PATCH] [mlir][OpenMP] implement SIMD reduction
This replicates clang's implementation. Basically:
- A private copy of the reduction variable is created, initialized to
the reduction neutral value (using regions from the reduction
declaration op).
- The body of the loop is lowered as usual, with accesses to the
reduction variable mapped to the private copy.
- After the loop, we inline the reduction region from the declaration
op to combine the privatized variable into the original variable.
- As usual with the SIMD construct, attributes are added to encourage
vectorization of the loop and to assert that memory accesses in the
loop don't alias across iterations.
I have verified that simple scalar examples do vectorize at -O3 and the
tests I could find in the Fujitsu test suite produce correct results. I
tested on top of #146097 and this seemed to work for composite constructs
as well.
Fixes #144290
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 73 +++++++++++++-
.../LLVMIR/openmp-simd-reduction-byref.mlir | 98 +++++++++++++++++++
.../LLVMIR/openmp-simd-reduction-simple.mlir | 96 ++++++++++++++++++
mlir/test/Target/LLVMIR/openmp-todo.mlir | 30 ------
4 files changed, 266 insertions(+), 31 deletions(-)
create mode 100644 mlir/test/Target/LLVMIR/openmp-simd-reduction-byref.mlir
create mode 100644 mlir/test/Target/LLVMIR/openmp-simd-reduction-simple.mlir
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 3806db3ceab25..0f2541ccefe78 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -370,7 +370,7 @@ static LogicalResult checkImplementationStatus(Operation &op) {
}
};
auto checkReduction = [&todo](auto op, LogicalResult &result) {
- if (isa<omp::TeamsOp>(op) || isa<omp::SimdOp>(op))
+ if (isa<omp::TeamsOp>(op))
if (!op.getReductionVars().empty() || op.getReductionByref() ||
op.getReductionSyms())
result = todo("reduction");
@@ -2864,6 +2864,17 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
PrivateVarsInfo privateVarsInfo(simdOp);
+ MutableArrayRef<BlockArgument> reductionArgs =
+ cast<omp::BlockArgOpenMPOpInterface>(opInst).getReductionBlockArgs();
+ DenseMap<Value, llvm::Value *> reductionVariableMap;
+ SmallVector<llvm::Value *> privateReductionVariables(
+ simdOp.getNumReductionVars());
+ SmallVector<DeferredStore> deferredStores;
+ SmallVector<omp::DeclareReductionOp> reductionDecls;
+ collectReductionDecls(simdOp, reductionDecls);
+ llvm::ArrayRef<bool> isByRef = getIsByRef(simdOp.getReductionByref());
+ assert(isByRef.size() == simdOp.getNumReductionVars());
+
llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
findAllocaInsertPoint(builder, moduleTranslation);
@@ -2872,11 +2883,27 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
if (handleError(afterAllocas, opInst).failed())
return failure();
+ if (failed(allocReductionVars(simdOp, reductionArgs, builder,
+ moduleTranslation, allocaIP, reductionDecls,
+ privateReductionVariables, reductionVariableMap,
+ deferredStores, isByRef)))
+ return failure();
+
if (handleError(initPrivateVars(builder, moduleTranslation, privateVarsInfo),
opInst)
.failed())
return failure();
+ // TODO: no call to copyFirstPrivateVars?
+
+ assert(afterAllocas.get()->getSinglePredecessor());
+ if (failed(initReductionVars(simdOp, reductionArgs, builder,
+ moduleTranslation,
+ afterAllocas.get()->getSinglePredecessor(),
+ reductionDecls, privateReductionVariables,
+ reductionVariableMap, isByRef, deferredStores)))
+ return failure();
+
llvm::ConstantInt *simdlen = nullptr;
if (std::optional<uint64_t> simdlenVar = simdOp.getSimdlen())
simdlen = builder.getInt64(simdlenVar.value());
@@ -2921,6 +2948,50 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
: nullptr,
order, simdlen, safelen);
+ // We now need to reduce the per-simd-lane reduction variable into the
+ // original variable. This works a bit differently to other reductions (e.g.
+ // wsloop) because we don't need to call into the OpenMP runtime to handle
+ // threads: everything happened in this one thread.
+ for (auto [i, tuple] : llvm::enumerate(
+ llvm::zip(reductionDecls, isByRef, simdOp.getReductionVars(),
+ privateReductionVariables))) {
+ auto [decl, byRef, reductionVar, privateReductionVar] = tuple;
+
+ OwningReductionGen gen = makeReductionGen(decl, builder, moduleTranslation);
+ llvm::Value *originalVariable = moduleTranslation.lookupValue(reductionVar);
+ llvm::Type *reductionType = moduleTranslation.convertType(decl.getType());
+
+ // We have one less load for by-ref case because that load is now inside of
+ // the reduction region
+ llvm::Value *redValue = originalVariable;
+ if (!byRef)
+ redValue =
+ builder.CreateLoad(reductionType, redValue, "red.value." + Twine(i));
+ llvm::Value *privateRedValue = builder.CreateLoad(
+ reductionType, privateReductionVar, "red.private.value." + Twine(i));
+ llvm::Value *reduced;
+
+ auto res = gen(builder.saveIP(), redValue, privateRedValue, reduced);
+ if (failed(handleError(res, opInst)))
+ return failure();
+ builder.restoreIP(res.get());
+
+ // for by-ref case, the store is inside of the reduction region
+ if (!byRef)
+ builder.CreateStore(reduced, originalVariable);
+ }
+
+ // After the construct, deallocate private reduction variables
+ SmallVector<Region *> reductionRegions;
+ llvm::transform(reductionDecls, std::back_inserter(reductionRegions),
+ [](omp::DeclareReductionOp reductionDecl) {
+ return &reductionDecl.getCleanupRegion();
+ });
+ if (failed(inlineOmpRegionCleanup(reductionRegions, privateReductionVariables,
+ moduleTranslation, builder,
+ "omp.reduction.cleanup")))
+ return failure();
+
return cleanupPrivateVars(builder, moduleTranslation, simdOp.getLoc(),
privateVarsInfo.llvmVars,
privateVarsInfo.privatizers);
diff --git a/mlir/test/Target/LLVMIR/openmp-simd-reduction-byref.mlir b/mlir/test/Target/LLVMIR/openmp-simd-reduction-byref.mlir
new file mode 100644
index 0000000000000..1344ea99e656c
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-simd-reduction-byref.mlir
@@ -0,0 +1,98 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+llvm.func @init(%arg0: !llvm.ptr {llvm.nocapture}, %arg1: !llvm.ptr {llvm.nocapture}) {
+ llvm.return
+}
+llvm.func @combine(%arg0: !llvm.ptr {llvm.nocapture}, %arg1: !llvm.ptr {llvm.nocapture}) {
+ llvm.return
+}
+llvm.func @cleanup(%arg0: !llvm.ptr {llvm.nocapture}) {
+ llvm.return
+}
+omp.private {type = private} @_QFsimd_reductionEi_private_i32 : i32
+omp.declare_reduction @add_reduction_byref_box_2xf32 : !llvm.ptr alloc {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr
+ omp.yield(%1 : !llvm.ptr)
+} init {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ llvm.call @init(%arg0, %arg1) : (!llvm.ptr, !llvm.ptr) -> ()
+ omp.yield(%arg1 : !llvm.ptr)
+} combiner {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ llvm.call @combine(%arg0, %arg1) : (!llvm.ptr, !llvm.ptr) -> ()
+ omp.yield(%arg0 : !llvm.ptr)
+} cleanup {
+^bb0(%arg0: !llvm.ptr):
+ llvm.call @cleanup(%arg0) : (!llvm.ptr) -> ()
+ omp.yield
+}
+llvm.func @_QPsimd_reduction(%arg0: !llvm.ptr {fir.bindc_name = "a", llvm.nocapture}, %arg1: !llvm.ptr {fir.bindc_name = "sum", llvm.nocapture}) {
+ %0 = llvm.mlir.constant(1024 : i32) : i32
+ %1 = llvm.mlir.constant(1 : i32) : i32
+ %2 = llvm.mlir.constant(1 : i64) : i64
+ %3 = llvm.alloca %2 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr
+ %4 = llvm.alloca %2 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+ omp.simd private(@_QFsimd_reductionEi_private_i32 %4 -> %arg2 : !llvm.ptr) reduction(byref @add_reduction_byref_box_2xf32 %3 -> %arg3 : !llvm.ptr) {
+ omp.loop_nest (%arg4) : i32 = (%1) to (%0) inclusive step (%1) {
+ llvm.store %arg4, %arg2 : i32, !llvm.ptr
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define void @_QPsimd_reduction
+// CHECK: %[[VAL_0:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
+// CHECK: %[[VAL_1:.*]] = alloca i32, i64 1, align 4
+// CHECK: %[[VAL_2:.*]] = alloca i32, align 4
+// CHECK: %[[VAL_3:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8
+// CHECK: %[[VAL_4:.*]] = alloca ptr, align 8
+// CHECK: br label %[[VAL_5:.*]]
+// CHECK: omp.region.after_alloca: ; preds = %[[VAL_6:.*]]
+// CHECK: br label %[[VAL_7:.*]]
+// CHECK: entry: ; preds = %[[VAL_5]]
+// CHECK: br label %[[VAL_8:.*]]
+// CHECK: omp.private.init: ; preds = %[[VAL_7]]
+// CHECK: br label %[[VAL_9:.*]]
+// CHECK: omp.reduction.init: ; preds = %[[VAL_8]]
+// CHECK: store ptr %[[VAL_3]], ptr %[[VAL_4]], align 8
+// CHECK: call void @init(ptr %[[VAL_0]], ptr %[[VAL_3]])
+// CHECK: br label %[[VAL_10:.*]]
+// CHECK: omp.simd.region: ; preds = %[[VAL_9]]
+// CHECK: br label %[[VAL_11:.*]]
+// CHECK: omp_loop.preheader: ; preds = %[[VAL_10]]
+// CHECK: br label %[[VAL_12:.*]]
+// CHECK: omp_loop.header: ; preds = %[[VAL_13:.*]], %[[VAL_11]]
+// CHECK: %[[VAL_14:.*]] = phi i32 [ 0, %[[VAL_11]] ], [ %[[VAL_15:.*]], %[[VAL_13]] ]
+// CHECK: br label %[[VAL_16:.*]]
+// CHECK: omp_loop.cond: ; preds = %[[VAL_12]]
+// CHECK: %[[VAL_17:.*]] = icmp ult i32 %[[VAL_14]], 1024
+// CHECK: br i1 %[[VAL_17]], label %[[VAL_18:.*]], label %[[VAL_19:.*]]
+// CHECK: omp_loop.body: ; preds = %[[VAL_16]]
+// CHECK: %[[VAL_20:.*]] = mul i32 %[[VAL_14]], 1
+// CHECK: %[[VAL_21:.*]] = add i32 %[[VAL_20]], 1
+// CHECK: br label %[[VAL_22:.*]]
+// CHECK: omp.loop_nest.region: ; preds = %[[VAL_18]]
+// CHECK: store i32 %[[VAL_21]], ptr %[[VAL_2]], align 4, !llvm.access.group ![[ACCESS_GROUP:.*]]
+// CHECK: br label %[[VAL_23:.*]]
+// CHECK: omp.region.cont1: ; preds = %[[VAL_22]]
+// CHECK: br label %[[VAL_13]]
+// CHECK: omp_loop.inc: ; preds = %[[VAL_23]]
+// CHECK: %[[VAL_15]] = add nuw i32 %[[VAL_14]], 1
+// CHECK: br label %[[VAL_12]], !llvm.loop ![[LOOP:.*]]
+// CHECK: omp_loop.exit: ; preds = %[[VAL_16]]
+// CHECK: br label %[[VAL_24:.*]]
+// CHECK: omp_loop.after: ; preds = %[[VAL_19]]
+// CHECK: br label %[[VAL_25:.*]]
+// CHECK: omp.region.cont: ; preds = %[[VAL_24]]
+// CHECK: %[[VAL_26:.*]] = load ptr, ptr %[[VAL_4]], align 8
+// CHECK: call void @combine(ptr %[[VAL_0]], ptr %[[VAL_26]])
+// CHECK: %[[VAL_27:.*]] = load ptr, ptr %[[VAL_4]], align 8
+// CHECK: call void @cleanup(ptr %[[VAL_27]])
+// CHECK: ret void
+
+// CHECK: ![[ACCESS_GROUP]] = distinct !{}
+// CHECK: ![[LOOP]] = distinct !{![[LOOP]], ![[PARALLEL_ACCESS:.*]], ![[VECTORIZE:.*]]}
+// CHECK: ![[PARALLEL_ACCESS]] = !{!"llvm.loop.parallel_accesses", ![[ACCESS_GROUP]]}
+// CHECK: ![[VECTORIZE]] = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/mlir/test/Target/LLVMIR/openmp-simd-reduction-simple.mlir b/mlir/test/Target/LLVMIR/openmp-simd-reduction-simple.mlir
new file mode 100644
index 0000000000000..823bafb121b43
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-simd-reduction-simple.mlir
@@ -0,0 +1,96 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+omp.private {type = private} @_QFsimd_reductionEi_private_i32 : i32
+omp.declare_reduction @add_reduction_f32 : f32 init {
+^bb0(%arg0: f32):
+ %0 = llvm.mlir.constant(0.000000e+00 : f32) : f32
+ omp.yield(%0 : f32)
+} combiner {
+^bb0(%arg0: f32, %arg1: f32):
+ %0 = llvm.fadd %arg0, %arg1 {fastmathFlags = #llvm.fastmath<contract>} : f32
+ omp.yield(%0 : f32)
+}
+llvm.func @_QPsimd_reduction(%arg0: !llvm.ptr {fir.bindc_name = "a", llvm.nocapture}, %arg1: !llvm.ptr {fir.bindc_name = "sum", llvm.nocapture}) {
+ %0 = llvm.mlir.constant(0.000000e+00 : f32) : f32
+ %1 = llvm.mlir.constant(1 : i32) : i32
+ %2 = llvm.mlir.constant(1024 : i32) : i32
+ %3 = llvm.mlir.constant(1 : i64) : i64
+ %4 = llvm.alloca %3 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+ llvm.store %0, %arg1 : f32, !llvm.ptr
+ omp.simd private(@_QFsimd_reductionEi_private_i32 %4 -> %arg2 : !llvm.ptr) reduction(@add_reduction_f32 %arg1 -> %arg3 : !llvm.ptr) {
+ omp.loop_nest (%arg4) : i32 = (%1) to (%2) inclusive step (%1) {
+ llvm.store %arg4, %arg2 : i32, !llvm.ptr
+ %5 = llvm.load %arg3 : !llvm.ptr -> f32
+ %6 = llvm.load %arg2 : !llvm.ptr -> i32
+ %7 = llvm.sext %6 : i32 to i64
+ %8 = llvm.sub %7, %3 overflow<nsw> : i64
+ %9 = llvm.getelementptr %arg0[%8] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+ %10 = llvm.load %9 : !llvm.ptr -> f32
+ %11 = llvm.fadd %5, %10 {fastmathFlags = #llvm.fastmath<contract>} : f32
+ llvm.store %11, %arg3 : f32, !llvm.ptr
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define void @_QPsimd_reduction(
+// CHECK: %[[VAL_0:.*]] = alloca i32, i64 1, align 4
+// CHECK: store float 0.000000e+00, ptr %[[VAL_1:.*]], align 4
+// CHECK: %[[VAL_2:.*]] = alloca i32, align 4
+// CHECK: %[[VAL_3:.*]] = alloca float, align 4
+// CHECK: br label %[[VAL_4:.*]]
+// CHECK: omp.region.after_alloca: ; preds = %[[VAL_5:.*]]
+// CHECK: br label %[[VAL_6:.*]]
+// CHECK: entry: ; preds = %[[VAL_4]]
+// CHECK: br label %[[VAL_7:.*]]
+// CHECK: omp.private.init: ; preds = %[[VAL_6]]
+// CHECK: br label %[[VAL_8:.*]]
+// CHECK: omp.reduction.init: ; preds = %[[VAL_7]]
+// CHECK: store float 0.000000e+00, ptr %[[VAL_3]], align 4
+// CHECK: br label %[[VAL_9:.*]]
+// CHECK: omp.simd.region: ; preds = %[[VAL_8]]
+// CHECK: br label %[[VAL_10:.*]]
+// CHECK: omp_loop.preheader: ; preds = %[[VAL_9]]
+// CHECK: br label %[[VAL_11:.*]]
+// CHECK: omp_loop.header: ; preds = %[[VAL_12:.*]], %[[VAL_10]]
+// CHECK: %[[VAL_13:.*]] = phi i32 [ 0, %[[VAL_10]] ], [ %[[VAL_14:.*]], %[[VAL_12]] ]
+// CHECK: br label %[[VAL_15:.*]]
+// CHECK: omp_loop.cond: ; preds = %[[VAL_11]]
+// CHECK: %[[VAL_16:.*]] = icmp ult i32 %[[VAL_13]], 1024
+// CHECK: br i1 %[[VAL_16]], label %[[VAL_17:.*]], label %[[VAL_18:.*]]
+// CHECK: omp_loop.body: ; preds = %[[VAL_15]]
+// CHECK: %[[VAL_19:.*]] = mul i32 %[[VAL_13]], 1
+// CHECK: %[[VAL_20:.*]] = add i32 %[[VAL_19]], 1
+// CHECK: br label %[[VAL_21:.*]]
+// CHECK: omp.loop_nest.region: ; preds = %[[VAL_17]]
+// CHECK: store i32 %[[VAL_20]], ptr %[[VAL_2]], align 4, !llvm.access.group ![[ACCESS_GROUP:.*]]
+// CHECK: %[[VAL_22:.*]] = load float, ptr %[[VAL_3]], align 4, !llvm.access.group ![[ACCESS_GROUP]]
+// CHECK: %[[VAL_23:.*]] = load i32, ptr %[[VAL_2]], align 4, !llvm.access.group ![[ACCESS_GROUP]]
+// CHECK: %[[VAL_24:.*]] = sext i32 %[[VAL_23]] to i64
+// CHECK: %[[VAL_25:.*]] = sub nsw i64 %[[VAL_24]], 1
+// CHECK: %[[VAL_26:.*]] = getelementptr float, ptr %[[VAL_27:.*]], i64 %[[VAL_25]]
+// CHECK: %[[VAL_28:.*]] = load float, ptr %[[VAL_26]], align 4, !llvm.access.group ![[ACCESS_GROUP]]
+// CHECK: %[[VAL_29:.*]] = fadd contract float %[[VAL_22]], %[[VAL_28]]
+// CHECK: store float %[[VAL_29]], ptr %[[VAL_3]], align 4, !llvm.access.group ![[ACCESS_GROUP]]
+// CHECK: br label %[[VAL_30:.*]]
+// CHECK: omp.region.cont1: ; preds = %[[VAL_21]]
+// CHECK: br label %[[VAL_12]]
+// CHECK: omp_loop.inc: ; preds = %[[VAL_30]]
+// CHECK: %[[VAL_14]] = add nuw i32 %[[VAL_13]], 1
+// CHECK: br label %[[VAL_11]], !llvm.loop ![[LOOP:.*]]
+// CHECK: omp_loop.exit: ; preds = %[[VAL_15]]
+// CHECK: br label %[[VAL_31:.*]]
+// CHECK: omp_loop.after: ; preds = %[[VAL_18]]
+// CHECK: br label %[[VAL_32:.*]]
+// CHECK: omp.region.cont: ; preds = %[[VAL_31]]
+// CHECK: %[[VAL_33:.*]] = load float, ptr %[[VAL_1]], align 4
+// CHECK: %[[VAL_34:.*]] = load float, ptr %[[VAL_3]], align 4
+// CHECK: %[[VAL_35:.*]] = fadd contract float %[[VAL_33]], %[[VAL_34]]
+// CHECK: store float %[[VAL_35]], ptr %[[VAL_1]], align 4
+// CHECK: ret void
+
+// CHECK: ![[ACCESS_GROUP]] = distinct !{}
+// CHECK: ![[LOOP]] = distinct !{![[LOOP]], ![[PARALLEL_ACCESS:.*]], ![[VECTORIZE:.*]]}
+// CHECK: ![[PARALLEL_ACCESS]] = !{!"llvm.loop.parallel_accesses", ![[ACCESS_GROUP]]}
+// CHECK: ![[VECTORIZE]] = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 97608ca3b4df1..29725a02c075a 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -143,36 +143,6 @@ llvm.func @simd_linear(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
// -----
-omp.declare_reduction @add_f32 : f32
-init {
-^bb0(%arg: f32):
- %0 = llvm.mlir.constant(0.0 : f32) : f32
- omp.yield (%0 : f32)
-}
-combiner {
-^bb1(%arg0: f32, %arg1: f32):
- %1 = llvm.fadd %arg0, %arg1 : f32
- omp.yield (%1 : f32)
-}
-atomic {
-^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr):
- %2 = llvm.load %arg3 : !llvm.ptr -> f32
- llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
- omp.yield
-}
-llvm.func @simd_reduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
- // expected-error@below {{not yet implemented: Unhandled clause reduction in omp.simd operation}}
- // expected-error@below {{LLVM Translation failed for operation: omp.simd}}
- omp.simd reduction(@add_f32 %x -> %prv : !llvm.ptr) {
- omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
- omp.yield
- }
- }
- llvm.return
-}
-
-// -----
-
omp.declare_reduction @add_f32 : f32
init {
^bb0(%arg: f32):
More information about the Mlir-commits
mailing list