[llvm-branch-commits] [flang] [mlir] [flang][OpenMP] Lower target in_reduction for host fallback (PR #199967)
Sairudra More via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu May 28 04:54:07 PDT 2026
https://github.com/Saieiei updated https://github.com/llvm/llvm-project/pull/199967
>From c3847fed57dfdc41061291b90c6e6ca1ff950d28 Mon Sep 17 00:00:00 2001
From: saieiei <sairudra60 at gmail.com>
Date: Mon, 25 May 2026 12:23:14 -0500
Subject: [PATCH 1/3] [mlir][OpenMP] Translate task_reduction on taskgroup
Add LLVM IR translation support for the task_reduction clause on
omp.taskgroup.
The translation builds task-reduction descriptors for the listed reduction
variables and emits the runtime initialization before the taskgroup body.
The reducer init and combiner callbacks are generated from the corresponding
omp.declare_reduction regions.
This patch keeps taskloop reduction and in_reduction translation unsupported;
those remain follow-up work. Unsupported task_reduction forms are diagnosed
instead of being lowered incorrectly.
Add MLIR translation tests for taskgroup task_reduction, multiple reducers,
plain taskgroup translation, and remaining unsupported cases.
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 243 +++++++++++++++++-
.../openmp-taskgroup-task-reduction.mlir | 153 +++++++++++
mlir/test/Target/LLVMIR/openmp-todo.mlir | 58 ++++-
3 files changed, 444 insertions(+), 10 deletions(-)
create mode 100644 mlir/test/Target/LLVMIR/openmp-taskgroup-task-reduction.mlir
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index f0511bb4be7dd..dfec09a53075c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -370,10 +370,13 @@ static LogicalResult checkImplementationStatus(Operation &op) {
op.getReductionMod().value() != omp::ReductionModifier::defaultmod)
result = todo("reduction with modifier");
};
- auto checkTaskReduction = [&todo](auto op, LogicalResult &result) {
- if (!op.getTaskReductionVars().empty() || op.getTaskReductionByref() ||
- op.getTaskReductionSyms())
- result = todo("task_reduction");
+ auto checkTaskReductionByref = [&todo](auto op, LogicalResult &result) {
+ if (auto byrefAttr = op.getTaskReductionByref())
+ for (bool isByRef : *byrefAttr)
+ if (isByRef) {
+ result = todo("task_reduction with byref modifier");
+ return;
+ }
};
auto checkNumTeams = [&todo](auto op, LogicalResult &result) {
if (op.hasNumTeamsMultiDim())
@@ -426,7 +429,7 @@ static LogicalResult checkImplementationStatus(Operation &op) {
})
.Case([&](omp::TaskgroupOp op) {
checkAllocate(op, result);
- checkTaskReduction(op, result);
+ checkTaskReductionByref(op, result);
})
.Case([&](omp::TaskwaitOp op) {
checkDepend(op, result);
@@ -3643,6 +3646,183 @@ convertOmpTaskloopContextOp(omp::TaskloopContextOp contextOp,
return success();
}
+/// Build the outlined initializer helper for one task_reduction
+/// declare_reduction op.
+///
+/// The helper has the signature `void(ptr %priv, ptr %orig)` and internal
+/// linkage; its name is \p baseName followed by ".red.init". Only the
+/// non-byref form is handled: the initializer region's mold argument is
+/// mapped to the value loaded from %orig, and the single value yielded by
+/// the region is stored into %priv.
+///
+/// \returns the new function, or nullptr if translating the initializer
+/// region failed. On failure the partially built function is erased and the
+/// value mappings created for the region are dropped.
+static llvm::Function *
+emitTaskReductionInitFn(omp::DeclareReductionOp decl, StringRef baseName,
+                        LLVM::ModuleTranslation &moduleTranslation) {
+  llvm::Module *llvmModule = moduleTranslation.getLLVMModule();
+  llvm::LLVMContext &ctx = llvmModule->getContext();
+  llvm::Type *voidTy = llvm::Type::getVoidTy(ctx);
+  llvm::Type *ptrTy = llvm::PointerType::getUnqual(ctx);
+  llvm::FunctionType *fty =
+      llvm::FunctionType::get(voidTy, {ptrTy, ptrTy}, false);
+  llvm::Function *fn =
+      llvm::Function::Create(fty, llvm::GlobalValue::InternalLinkage,
+                             baseName + ".red.init", llvmModule);
+  fn->setDoesNotRecurse();
+  fn->getArg(0)->setName("priv");
+  fn->getArg(1)->setName("orig");
+
+  // The helper body is emitted with its own builder so the caller's
+  // insertion point is left untouched.
+  llvm::BasicBlock *entry = llvm::BasicBlock::Create(ctx, "entry", fn);
+  llvm::IRBuilder<> b(entry);
+
+  // Feed the initializer region with the current value of the original
+  // variable as its mold argument.
+  llvm::Type *elemTy = moduleTranslation.convertType(decl.getType());
+  llvm::Value *origVal = b.CreateLoad(elemTy, fn->getArg(1), "omp.orig");
+  moduleTranslation.mapValue(decl.getInitializerMoldArg(), origVal);
+  SmallVector<llvm::Value *, 1> phis;
+  if (failed(inlineConvertOmpRegions(decl.getInitializerRegion(),
+                                     "omp.taskred.init", b, moduleTranslation,
+                                     &phis))) {
+    // Drop the mappings made for this region before deleting the function:
+    // they refer to IR values owned by `fn` and would otherwise dangle.
+    moduleTranslation.forgetMapping(decl.getInitializerRegion());
+    fn->eraseFromParent();
+    return nullptr;
+  }
+  assert(phis.size() == 1 &&
+         "expected one value yielded from reduction initializer");
+  b.CreateStore(phis[0], fn->getArg(0));
+  b.CreateRetVoid();
+
+  // The same declare_reduction may be translated again for another use, so
+  // the region's mappings must not outlive this helper.
+  moduleTranslation.forgetMapping(decl.getInitializerRegion());
+  return fn;
+}
+
+/// Build the outlined combiner helper for one task_reduction
+/// declare_reduction op.
+///
+/// The helper has the signature `void(ptr %lhs, ptr %rhs)` and internal
+/// linkage; its name is \p baseName followed by ".red.comb". Only the
+/// non-byref form is handled: the values at *%lhs and *%rhs are loaded, fed
+/// into the combiner region as its two block arguments, and the single
+/// yielded value is stored back into *%lhs.
+///
+/// The combiner region of \p decl must be non-empty.
+///
+/// \returns the new function, or nullptr if translating the combiner region
+/// failed. On failure the partially built function is erased and the value
+/// mappings created for the region are dropped.
+static llvm::Function *
+emitTaskReductionCombFn(omp::DeclareReductionOp decl, StringRef baseName,
+                        LLVM::ModuleTranslation &moduleTranslation) {
+  llvm::Module *llvmModule = moduleTranslation.getLLVMModule();
+  llvm::LLVMContext &ctx = llvmModule->getContext();
+  llvm::Type *voidTy = llvm::Type::getVoidTy(ctx);
+  llvm::Type *ptrTy = llvm::PointerType::getUnqual(ctx);
+  llvm::FunctionType *fty =
+      llvm::FunctionType::get(voidTy, {ptrTy, ptrTy}, false);
+  llvm::Function *fn =
+      llvm::Function::Create(fty, llvm::GlobalValue::InternalLinkage,
+                             baseName + ".red.comb", llvmModule);
+  fn->setDoesNotRecurse();
+  fn->getArg(0)->setName("lhs");
+  fn->getArg(1)->setName("rhs");
+
+  // The helper body is emitted with its own builder so the caller's
+  // insertion point is left untouched.
+  llvm::BasicBlock *entry = llvm::BasicBlock::Create(ctx, "entry", fn);
+  llvm::IRBuilder<> b(entry);
+
+  llvm::Type *elemTy = moduleTranslation.convertType(decl.getType());
+  Block &combBlock = decl.getReductionRegion().front();
+  assert(combBlock.getNumArguments() == 2 &&
+         "expected two arguments in declare_reduction combiner");
+  // Both operands are passed by pointer; load them so the region sees values.
+  llvm::Value *lhsVal = b.CreateLoad(elemTy, fn->getArg(0), "omp.lhs");
+  llvm::Value *rhsVal = b.CreateLoad(elemTy, fn->getArg(1), "omp.rhs");
+  moduleTranslation.mapValue(combBlock.getArgument(0), lhsVal);
+  moduleTranslation.mapValue(combBlock.getArgument(1), rhsVal);
+
+  SmallVector<llvm::Value *, 1> phis;
+  if (failed(inlineConvertOmpRegions(decl.getReductionRegion(),
+                                     "omp.taskred.comb", b, moduleTranslation,
+                                     &phis))) {
+    // Drop the mappings made for this region before deleting the function:
+    // they refer to IR values owned by `fn` and would otherwise dangle.
+    moduleTranslation.forgetMapping(decl.getReductionRegion());
+    fn->eraseFromParent();
+    return nullptr;
+  }
+  assert(phis.size() == 1 &&
+         "expected one value yielded from reduction combiner");
+  // The combined result replaces the left-hand operand in place.
+  b.CreateStore(phis[0], fn->getArg(0));
+  b.CreateRetVoid();
+
+  // The same declare_reduction may be translated again for another use, so
+  // the region's mappings must not outlive this helper.
+  moduleTranslation.forgetMapping(decl.getReductionRegion());
+  return fn;
+}
+
+/// Emit the per-taskgroup task_reduction descriptor array and the
+/// `__kmpc_taskred_init` runtime call. Must be called with `builder` set to a
+/// point inside the taskgroup body (after `__kmpc_taskgroup`). The descriptor
+/// array itself is allocated at \p allocaIP.
+///
+/// Entry `i` of the array describes `tgOp.getTaskReductionVars()[i]` using
+/// `redDecls[i]`; the two sequences are expected to have the same length.
+/// For every entry an init and a combiner helper function are emitted into
+/// the module.
+///
+/// Only the non-byref form is handled here. Byref task_reduction has already
+/// been rejected by `checkImplementationStatus`.
+///
+/// \returns failure() if a helper function could not be generated.
+static LogicalResult emitTaskgroupTaskReductionInit(
+    omp::TaskgroupOp tgOp, ArrayRef<omp::DeclareReductionOp> redDecls,
+    llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointTy allocaIP,
+    LLVM::ModuleTranslation &moduleTranslation) {
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  llvm::Module *llvmModule = moduleTranslation.getLLVMModule();
+  llvm::LLVMContext &ctx = llvmModule->getContext();
+  const llvm::DataLayout &dl = llvmModule->getDataLayout();
+
+  llvm::Type *ptrTy = llvm::PointerType::getUnqual(ctx);
+  llvm::Type *i32Ty = llvm::Type::getInt32Ty(ctx);
+  llvm::Type *sizeTy =
+      llvm::Type::getIntNTy(ctx, dl.getPointerSizeInBits(/*AddrSpace=*/0));
+
+  // Identified `kmp_taskred_input_t` struct, matching the layout used by
+  // Clang's CGOpenMPRuntime::emitTaskReductionInit. Reuse the type if an
+  // earlier taskgroup in this module already created it.
+  llvm::StructType *redInputTy =
+      llvm::StructType::getTypeByName(ctx, "kmp_taskred_input_t");
+  if (!redInputTy)
+    redInputTy = llvm::StructType::create(
+        ctx, {ptrTy, ptrTy, sizeTy, ptrTy, ptrTy, ptrTy, i32Ty},
+        "kmp_taskred_input_t");
+
+  unsigned n = redDecls.size();
+  llvm::ArrayType *arrTy = llvm::ArrayType::get(redInputTy, n);
+
+  // Allocate the descriptor array in the enclosing function's alloca block.
+  // The guard restores the body insertion point when the scope ends.
+  llvm::AllocaInst *arrAlloca;
+  {
+    llvm::IRBuilderBase::InsertPointGuard guard(builder);
+    builder.restoreIP(allocaIP);
+    arrAlloca =
+        builder.CreateAlloca(arrTy, /*ArraySize=*/nullptr, ".taskred.input");
+  }
+
+  // Fill each descriptor entry inside the taskgroup body.
+  llvm::Value *zero = builder.getInt32(0);
+  for (unsigned i = 0; i < n; ++i) {
+    omp::DeclareReductionOp decl = redDecls[i];
+    llvm::Value *orig =
+        moduleTranslation.lookupValue(tgOp.getTaskReductionVars()[i]);
+    llvm::Type *elemTy = moduleTranslation.convertType(decl.getType());
+    uint64_t size = dl.getTypeAllocSize(elemTy).getFixedValue();
+
+    std::string baseName =
+        (llvm::Twine("__omp_taskred_") + decl.getSymName()).str();
+    // Build the helpers one after the other so that a failure never leaves
+    // an unreferenced helper function behind in the module.
+    llvm::Function *initFn =
+        emitTaskReductionInitFn(decl, baseName, moduleTranslation);
+    if (!initFn)
+      return failure();
+    llvm::Function *combFn =
+        emitTaskReductionCombFn(decl, baseName, moduleTranslation);
+    if (!combFn) {
+      // `initFn` has not been stored into the descriptor yet, so it has no
+      // users and can be removed safely.
+      initFn->eraseFromParent();
+      return failure();
+    }
+    llvm::Value *elemPtr = builder.CreateInBoundsGEP(
+        arrTy, arrAlloca, {zero, builder.getInt32(i)}, ".taskred.elem");
+    auto storeField = [&](unsigned fieldIdx, llvm::Value *val) {
+      llvm::Value *fieldPtr =
+          builder.CreateStructGEP(redInputTy, elemPtr, fieldIdx);
+      builder.CreateStore(val, fieldPtr);
+    };
+    storeField(0, orig);                                  // reduce_shar
+    storeField(1, orig);                                  // reduce_orig
+    storeField(2, llvm::ConstantInt::get(sizeTy, size));  // reduce_size
+    storeField(3, initFn);                                // reduce_init
+    storeField(4, llvm::ConstantPointerNull::get(ptrTy)); // reduce_fini
+    storeField(5, combFn);                                // reduce_comb
+    storeField(6, llvm::ConstantInt::get(i32Ty, 0));      // flags
+  }
+
+  // Emit call: __kmpc_taskred_init(gtid, num, &arr).
+  uint32_t srcLocSize;
+  llvm::Constant *srcLocStr =
+      ompBuilder->getOrCreateDefaultSrcLocStr(srcLocSize);
+  llvm::Value *ident = ompBuilder->getOrCreateIdent(srcLocStr, srcLocSize);
+  // Query the thread id through `builder` so the call is placed at the
+  // caller-chosen insertion point, as the taskloop lowering does, rather
+  // than wherever OpenMPIRBuilder's own builder currently points.
+  llvm::Function *gtidFn = ompBuilder->getOrCreateRuntimeFunctionPtr(
+      llvm::omp::OMPRTL___kmpc_global_thread_num);
+  llvm::Value *gtid =
+      builder.CreateCall(gtidFn, {ident}, "omp_global_thread_num");
+  llvm::FunctionCallee taskredInit = ompBuilder->getOrCreateRuntimeFunction(
+      *llvmModule, llvm::omp::OMPRTL___kmpc_taskred_init);
+  // The descriptor returned by the runtime is not needed for a plain
+  // taskgroup, so the call result is intentionally unused.
+  builder.CreateCall(taskredInit, {gtid, builder.getInt32(n), arrAlloca});
+  return success();
+}
+
/// Converts an OpenMP taskgroup construct into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
@@ -3651,9 +3831,58 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
if (failed(checkImplementationStatus(*tgOp)))
return failure();
- auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP,
- llvm::ArrayRef<llvm::BasicBlock *> deallocBlocks) {
+ // Resolve and validate task_reduction declarations up front. We only handle
+ // declare_reduction ops shaped like a non-byref scalar reduction in this
+ // first cut; richer shapes (two-argument initializer, cleanup region,
+ // missing combiner) require additional infrastructure.
+ SmallVector<omp::DeclareReductionOp> redDecls;
+ if (auto syms = tgOp.getTaskReductionSyms()) {
+ redDecls.reserve(syms->size());
+ for (auto sym : syms->getAsRange<SymbolRefAttr>()) {
+ auto decl = SymbolTable::lookupNearestSymbolFrom<omp::DeclareReductionOp>(
+ tgOp, sym);
+ if (!decl)
+ return tgOp.emitError()
+ << "failed to resolve task_reduction declare_reduction symbol "
+ << sym.getRootReference() << " in omp.taskgroup";
+ if (decl.getInitializerRegion().front().getNumArguments() != 1)
+ return tgOp.emitError("not yet implemented: task_reduction with "
+ "two-argument initializer in omp.taskgroup");
+ if (!decl.getCleanupRegion().empty())
+ return tgOp.emitError("not yet implemented: task_reduction with "
+ "cleanup region in omp.taskgroup");
+ if (decl.getReductionRegion().empty())
+ return tgOp.emitError("task_reduction declare_reduction is missing a "
+ "combiner region");
+ redDecls.push_back(decl);
+ }
+ }
+
+ auto bodyCB =
+ [&](InsertPointTy allocaIP, InsertPointTy codegenIP,
+ llvm::ArrayRef<llvm::BasicBlock *> deallocBlocks) -> llvm::Error {
builder.restoreIP(codegenIP);
+
+ if (!redDecls.empty()) {
+ if (failed(emitTaskgroupTaskReductionInit(tgOp, redDecls, builder,
+ allocaIP, moduleTranslation)))
+ return llvm::createStringError(
+ llvm::inconvertibleErrorCode(),
+ "failed to emit task_reduction initialization for omp.taskgroup");
+ }
+
+ // Inside the taskgroup body, each task_reduction block argument refers to
+ // the same shared/original storage that the runtime now knows about via
+ // the descriptor array. Inner tasks that declare in_reduction look up
+ // per-task private copies through the runtime; the taskgroup body itself
+ // uses the original variable.
+ for (auto [i, blockArg] :
+ llvm::enumerate(tgOp.getRegion().getArguments())) {
+ llvm::Value *orig =
+ moduleTranslation.lookupValue(tgOp.getTaskReductionVars()[i]);
+ moduleTranslation.mapValue(blockArg, orig);
+ }
+
return convertOmpOpRegions(tgOp.getRegion(), "omp.taskgroup.region",
builder, moduleTranslation)
.takeError();
diff --git a/mlir/test/Target/LLVMIR/openmp-taskgroup-task-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-taskgroup-task-reduction.mlir
new file mode 100644
index 0000000000000..353ce6218d9f3
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-taskgroup-task-reduction.mlir
@@ -0,0 +1,153 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+// Single scalar task_reduction on omp.taskgroup. Verifies that the
+// kmp_taskred_input_t descriptor is allocated, populated, and handed off to
+// __kmpc_taskred_init, and that init / combiner helper functions are emitted.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+llvm.func @taskgroup_task_reduction_single(%x: !llvm.ptr) {
+ omp.taskgroup task_reduction(@add_i32 %x -> %prv : !llvm.ptr) {
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK: %kmp_taskred_input_t = type { ptr, ptr, i64, ptr, ptr, ptr, i32 }
+
+// CHECK-LABEL: define void @taskgroup_task_reduction_single(
+// CHECK-SAME: ptr %[[X:.+]])
+// CHECK: %[[ARR:.+]] = alloca [1 x %kmp_taskred_input_t]
+// CHECK: call void @__kmpc_taskgroup(
+// Descriptor entry 0.
+// CHECK: %[[GEP0:.+]] = getelementptr inbounds [1 x %kmp_taskred_input_t], ptr %[[ARR]], i32 0, i32 0
+// CHECK: %[[SHAR:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[GEP0]], i32 0, i32 0
+// CHECK: store ptr %[[X]], ptr %[[SHAR]]
+// CHECK: %[[ORIG:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[GEP0]], i32 0, i32 1
+// CHECK: store ptr %[[X]], ptr %[[ORIG]]
+// CHECK: %[[SZF:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[GEP0]], i32 0, i32 2
+// CHECK: store i64 4, ptr %[[SZF]]
+// CHECK: %[[INITF:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[GEP0]], i32 0, i32 3
+// CHECK: store ptr @__omp_taskred_add_i32.red.init, ptr %[[INITF]]
+// CHECK: %[[FINIF:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[GEP0]], i32 0, i32 4
+// CHECK: store ptr null, ptr %[[FINIF]]
+// CHECK: %[[COMBF:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[GEP0]], i32 0, i32 5
+// CHECK: store ptr @__omp_taskred_add_i32.red.comb, ptr %[[COMBF]]
+// CHECK: %[[FLAGSF:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[GEP0]], i32 0, i32 6
+// CHECK: store i32 0, ptr %[[FLAGSF]]
+// CHECK: call ptr @__kmpc_taskred_init(i32 %{{.+}}, i32 1, ptr %[[ARR]])
+// CHECK: call void @__kmpc_end_taskgroup(
+
+// CHECK-LABEL: define internal void @__omp_taskred_add_i32.red.init(
+// CHECK-SAME: ptr %priv, ptr %orig)
+// CHECK: load i32, ptr %orig
+// CHECK: store i32 0, ptr %priv
+// CHECK: ret void
+
+// CHECK-LABEL: define internal void @__omp_taskred_add_i32.red.comb(
+// CHECK-SAME: ptr %lhs, ptr %rhs)
+// CHECK: %[[L:.+]] = load i32, ptr %lhs
+// CHECK: %[[R:.+]] = load i32, ptr %rhs
+// CHECK: %[[S:.+]] = add i32 %[[L]], %[[R]]
+// CHECK: store i32 %[[S]], ptr %lhs
+// CHECK: ret void
+
+// -----
+
+// Multiple task_reduction items on the same taskgroup.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+omp.declare_reduction @mul_i64 : i64
+init {
+^bb0(%arg0: i64):
+ %c1 = llvm.mlir.constant(1 : i64) : i64
+ omp.yield(%c1 : i64)
+}
+combiner {
+^bb0(%arg0: i64, %arg1: i64):
+ %p = llvm.mul %arg0, %arg1 : i64
+ omp.yield(%p : i64)
+}
+
+llvm.func @taskgroup_task_reduction_multi(%x: !llvm.ptr, %y: !llvm.ptr) {
+ omp.taskgroup task_reduction(@add_i32 %x -> %a, @mul_i64 %y -> %b : !llvm.ptr, !llvm.ptr) {
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define void @taskgroup_task_reduction_multi(
+// CHECK-SAME: ptr %[[XA:[^,)]+]], ptr %[[YA:[^,)]+]])
+// CHECK: %[[ARR2:.+]] = alloca [2 x %kmp_taskred_input_t]
+// CHECK: call void @__kmpc_taskgroup(
+// Descriptor entry 0: @add_i32 on %x.
+// CHECK: %[[E0:.+]] = getelementptr inbounds [2 x %kmp_taskred_input_t], ptr %[[ARR2]], i32 0, i32 0
+// CHECK: %[[E0_SHAR:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E0]], i32 0, i32 0
+// CHECK: store ptr %[[XA]], ptr %[[E0_SHAR]]
+// CHECK: %[[E0_ORIG:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E0]], i32 0, i32 1
+// CHECK: store ptr %[[XA]], ptr %[[E0_ORIG]]
+// CHECK: %[[E0_SZ:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E0]], i32 0, i32 2
+// CHECK: store i64 4, ptr %[[E0_SZ]]
+// CHECK: %[[E0_INIT:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E0]], i32 0, i32 3
+// CHECK: store ptr @__omp_taskred_add_i32.red.init, ptr %[[E0_INIT]]
+// CHECK: %[[E0_FINI:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E0]], i32 0, i32 4
+// CHECK: store ptr null, ptr %[[E0_FINI]]
+// CHECK: %[[E0_COMB:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E0]], i32 0, i32 5
+// CHECK: store ptr @__omp_taskred_add_i32.red.comb, ptr %[[E0_COMB]]
+// CHECK: %[[E0_FL:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E0]], i32 0, i32 6
+// CHECK: store i32 0, ptr %[[E0_FL]]
+// Descriptor entry 1: @mul_i64 on %y.
+// CHECK: %[[E1:.+]] = getelementptr inbounds [2 x %kmp_taskred_input_t], ptr %[[ARR2]], i32 0, i32 1
+// CHECK: %[[E1_SHAR:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E1]], i32 0, i32 0
+// CHECK: store ptr %[[YA]], ptr %[[E1_SHAR]]
+// CHECK: %[[E1_ORIG:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E1]], i32 0, i32 1
+// CHECK: store ptr %[[YA]], ptr %[[E1_ORIG]]
+// CHECK: %[[E1_SZ:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E1]], i32 0, i32 2
+// CHECK: store i64 8, ptr %[[E1_SZ]]
+// CHECK: %[[E1_INIT:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E1]], i32 0, i32 3
+// CHECK: store ptr @__omp_taskred_mul_i64.red.init, ptr %[[E1_INIT]]
+// CHECK: %[[E1_FINI:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E1]], i32 0, i32 4
+// CHECK: store ptr null, ptr %[[E1_FINI]]
+// CHECK: %[[E1_COMB:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E1]], i32 0, i32 5
+// CHECK: store ptr @__omp_taskred_mul_i64.red.comb, ptr %[[E1_COMB]]
+// CHECK: %[[E1_FL:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[E1]], i32 0, i32 6
+// CHECK: store i32 0, ptr %[[E1_FL]]
+// CHECK: call ptr @__kmpc_taskred_init(i32 %{{.+}}, i32 2, ptr %[[ARR2]])
+
+// -----
+
+// Plain taskgroup without task_reduction must still translate (regression
+// guard for the rewrite of convertOmpTaskgroupOp).
+
+llvm.func @taskgroup_plain() {
+ omp.taskgroup {
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define void @taskgroup_plain()
+// CHECK: call void @__kmpc_taskgroup(
+// CHECK-NOT: call ptr @__kmpc_taskred_init(
+// CHECK: call void @__kmpc_end_taskgroup(
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 295ba54dbfb38..d5d0a96779db0 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -301,10 +301,62 @@ atomic {
llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
omp.yield
}
-llvm.func @taskgroup_task_reduction(%x : !llvm.ptr) {
- // expected-error at below {{not yet implemented: Unhandled clause task_reduction in omp.taskgroup operation}}
+llvm.func @taskgroup_task_reduction_byref(%x : !llvm.ptr) {
+ // expected-error at below {{not yet implemented: Unhandled clause task_reduction with byref modifier in omp.taskgroup operation}}
// expected-error at below {{LLVM Translation failed for operation: omp.taskgroup}}
- omp.taskgroup task_reduction(@add_f32 %x -> %prv : !llvm.ptr) {
+ omp.taskgroup task_reduction(byref @add_f32 %x -> %prv : !llvm.ptr) {
+ omp.terminator
+ }
+ llvm.return
+}
+// -----
+
+omp.declare_reduction @add_i32_cleanup : i32
+init {
+^bb0(%arg: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%a: i32, %b: i32):
+ %s = llvm.add %a, %b : i32
+ omp.yield(%s : i32)
+}
+cleanup {
+^bb0(%a: i32):
+ omp.yield
+}
+llvm.func @taskgroup_task_reduction_cleanup(%x : !llvm.ptr) {
+ // expected-error at below {{not yet implemented: task_reduction with cleanup region in omp.taskgroup}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskgroup}}
+ omp.taskgroup task_reduction(@add_i32_cleanup %x -> %prv : !llvm.ptr) {
+ omp.terminator
+ }
+ llvm.return
+}
+// -----
+
+omp.declare_reduction @add_i32_2arg_init : !llvm.ptr
+alloc {
+^bb0(%mold: !llvm.ptr):
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
+ omp.yield(%0 : !llvm.ptr)
+}
+init {
+^bb0(%mold: !llvm.ptr, %alloc: !llvm.ptr):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ llvm.store %c0, %alloc : i32, !llvm.ptr
+ omp.yield(%alloc : !llvm.ptr)
+}
+combiner {
+^bb0(%a: !llvm.ptr, %b: !llvm.ptr):
+ omp.yield(%a : !llvm.ptr)
+}
+llvm.func @taskgroup_task_reduction_two_arg_init(%x : !llvm.ptr) {
+ // expected-error at below {{not yet implemented: task_reduction with two-argument initializer in omp.taskgroup}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskgroup}}
+ omp.taskgroup task_reduction(@add_i32_2arg_init %x -> %prv : !llvm.ptr) {
omp.terminator
}
llvm.return
>From 1ca9edc6fc44ae75eda94bf65e0cd4d2bca2620c Mon Sep 17 00:00:00 2001
From: Sairudra More <moresair at pe31.hpc.amslabs.hpecorp.net>
Date: Tue, 26 May 2026 04:30:16 -0500
Subject: [PATCH 2/3] [mlir][OpenMP] Translate reductions on taskloop
Add LLVM IR translation for reduction and in_reduction clauses on omp.taskloop.context.
For taskloop reduction, emit the implicit taskgroup reduction setup and map each generated task to runtime-provided private reduction storage through __kmpc_task_reduction_get_th_data. For in_reduction, use the same runtime lookup path with a null descriptor to join an enclosing task reduction context.
Unsupported byref, cleanup, and two-argument initializer forms remain diagnosed.
Add MLIR translation tests for the supported taskloop reduction and in_reduction cases.
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 227 ++++++++++++++--
.../LLVMIR/openmp-taskloop-reduction.mlir | 245 ++++++++++++++++++
mlir/test/Target/LLVMIR/openmp-todo.mlir | 102 +++++++-
3 files changed, 543 insertions(+), 31 deletions(-)
create mode 100644 mlir/test/Target/LLVMIR/openmp-taskloop-reduction.mlir
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index dfec09a53075c..1120d9fc38d0a 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -362,7 +362,7 @@ static LogicalResult checkImplementationStatus(Operation &op) {
result = todo("privatization");
};
auto checkReduction = [&todo](auto op, LogicalResult &result) {
- if (isa<omp::TeamsOp>(op) || isa<omp::TaskloopContextOp>(op))
+ if (isa<omp::TeamsOp>(op))
if (!op.getReductionVars().empty() || op.getReductionByref() ||
op.getReductionSyms())
result = todo("reduction");
@@ -378,6 +378,22 @@ static LogicalResult checkImplementationStatus(Operation &op) {
return;
}
};
+ auto checkReductionByref = [&todo](auto op, LogicalResult &result) {
+ if (auto byrefAttr = op.getReductionByref())
+ for (bool isByRef : *byrefAttr)
+ if (isByRef) {
+ result = todo("reduction with byref modifier");
+ return;
+ }
+ };
+ auto checkInReductionByref = [&todo](auto op, LogicalResult &result) {
+ if (auto byrefAttr = op.getInReductionByref())
+ for (bool isByRef : *byrefAttr)
+ if (isByRef) {
+ result = todo("in_reduction with byref modifier");
+ return;
+ }
+ };
auto checkNumTeams = [&todo](auto op, LogicalResult &result) {
if (op.hasNumTeamsMultiDim())
result = todo("num_teams with multi-dimensional values");
@@ -437,8 +453,9 @@ static LogicalResult checkImplementationStatus(Operation &op) {
})
.Case([&](omp::TaskloopContextOp op) {
checkAllocate(op, result);
- checkInReduction(op, result);
+ checkInReductionByref(op, result);
checkReduction(op, result);
+ checkReductionByref(op, result);
})
.Case([&](omp::WsloopOp op) {
checkAllocate(op, result);
@@ -3327,6 +3344,15 @@ computeTaskloopBounds(omp::LoopNestOp loopOp, llvm::IRBuilderBase &builder,
return llvm::Error::success();
}
+// Forward declaration: defined alongside the taskgroup task_reduction
+// lowering further down in this file. Shared between omp.taskgroup and
+// omp.taskloop.context translation.
+static llvm::Value *emitTaskReductionInitCall(
+ ArrayRef<omp::DeclareReductionOp> redDecls,
+ ArrayRef<llvm::Value *> origPtrs, StringRef helperNamePrefix,
+ llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointTy allocaIP,
+ LLVM::ModuleTranslation &moduleTranslation);
+
// Converts an OpenMP taskloop construct into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpTaskloopContextOp(omp::TaskloopContextOp contextOp,
@@ -3417,6 +3443,90 @@ convertOmpTaskloopContextOp(omp::TaskloopContextOp contextOp,
// Set up inserttion point for call to createTaskloop()
builder.SetInsertPoint(taskloopStartBlock);
+ // Resolve and validate reduction / in_reduction declarations. Only the
+ // non-byref, single-init-arg, no-cleanup form is supported in this first
+ // cut; richer shapes have been rejected by checkImplementationStatus
+ // (byref) or are rejected here.
+ auto resolveRedDecls =
+ [&](std::optional<ArrayAttr> syms, StringRef clauseName,
+ SmallVectorImpl<omp::DeclareReductionOp> &out) -> LogicalResult {
+ if (!syms)
+ return success();
+ out.reserve(syms->size());
+ for (auto sym : syms->getAsRange<SymbolRefAttr>()) {
+ auto decl = SymbolTable::lookupNearestSymbolFrom<omp::DeclareReductionOp>(
+ contextOp, sym);
+ if (!decl)
+ return contextOp.emitError()
+ << "failed to resolve " << clauseName
+ << " declare_reduction symbol " << sym.getRootReference()
+ << " in omp.taskloop.context";
+ if (decl.getInitializerRegion().front().getNumArguments() != 1)
+ return contextOp.emitError()
+ << "not yet implemented: " << clauseName
+ << " with two-argument initializer in omp.taskloop.context";
+ if (!decl.getCleanupRegion().empty())
+ return contextOp.emitError()
+ << "not yet implemented: " << clauseName
+ << " with cleanup region in omp.taskloop.context";
+ if (decl.getReductionRegion().empty())
+ return contextOp.emitError()
+ << clauseName
+ << " declare_reduction is missing a combiner region";
+ out.push_back(decl);
+ }
+ return success();
+ };
+
+ SmallVector<omp::DeclareReductionOp> redDecls;
+ if (failed(
+ resolveRedDecls(contextOp.getReductionSyms(), "reduction", redDecls)))
+ return failure();
+ SmallVector<omp::DeclareReductionOp> inRedDecls;
+ if (failed(resolveRedDecls(contextOp.getInReductionSyms(), "in_reduction",
+ inRedDecls)))
+ return failure();
+
+ // The op verifier rejects nogroup + reduction, so no check is needed here.
+
+ SmallVector<llvm::Value *> redOrigPtrs;
+ redOrigPtrs.reserve(redDecls.size());
+ for (Value v : contextOp.getReductionVars())
+ redOrigPtrs.push_back(moduleTranslation.lookupValue(v));
+ SmallVector<llvm::Value *> inRedOrigPtrs;
+ inRedOrigPtrs.reserve(inRedDecls.size());
+ for (Value v : contextOp.getInReductionVars())
+ inRedOrigPtrs.push_back(moduleTranslation.lookupValue(v));
+
+ llvm::OpenMPIRBuilder &ompBuilderRef = *moduleTranslation.getOpenMPBuilder();
+ llvm::Module *llvmModuleForRed = moduleTranslation.getLLVMModule();
+
+ // If we have task_reduction items, we must emit our own implicit
+ // __kmpc_taskgroup so that the descriptor returned by __kmpc_taskred_init
+ // is associated with that taskgroup. We then force NoGroup=true so that
+ // OpenMPIRBuilder::createTaskloop does not emit a second taskgroup.
+ bool implicitTaskgroup = !redDecls.empty();
+ llvm::Value *redDesc = nullptr;
+ if (implicitTaskgroup) {
+ uint32_t srcLocSize;
+ llvm::Constant *srcLocStr =
+ ompBuilderRef.getOrCreateDefaultSrcLocStr(srcLocSize);
+ llvm::Value *ident = ompBuilderRef.getOrCreateIdent(srcLocStr, srcLocSize);
+ llvm::Function *gtidFn = ompBuilderRef.getOrCreateRuntimeFunctionPtr(
+ llvm::omp::OMPRTL___kmpc_global_thread_num);
+ llvm::Value *outerGtid =
+ builder.CreateCall(gtidFn, {ident}, "omp_global_thread_num");
+ llvm::FunctionCallee taskgroupFn = ompBuilderRef.getOrCreateRuntimeFunction(
+ *llvmModuleForRed, llvm::omp::OMPRTL___kmpc_taskgroup);
+ builder.CreateCall(taskgroupFn, {ident, outerGtid});
+
+ redDesc = emitTaskReductionInitCall(redDecls, redOrigPtrs,
+ "__omp_taskloop_taskred_", builder,
+ allocaIP, moduleTranslation);
+ if (!redDesc)
+ return failure();
+ }
+
auto loopOp = cast<omp::LoopNestOp>(loopWrapperOp.getWrappedLoop());
llvm::Value *lbVal = nullptr;
llvm::Value *ubVal = nullptr;
@@ -3491,6 +3601,49 @@ convertOmpTaskloopContextOp(omp::TaskloopContextOp contextOp,
moduleTranslation.mapValue(blockArg, llvmPrivateVar);
}
+ // Map reduction and in_reduction block arguments to the per-task private
+ // storage returned by __kmpc_task_reduction_get_th_data. This call must
+ // be emitted inside the to-be-outlined task body so that it returns the
+ // *executing* thread's gtid (not the encountering thread's). The
+ // taskgroup descriptor `redDesc` is computed in the outer scope and is
+ // auto-captured into the task shareds aggregate by CodeExtractor during
+ // OpenMPIRBuilder::finalize. For in_reduction the descriptor is NULL:
+ // the runtime walks up enclosing taskgroups to find the matching
+ // task_reduction registration for `origPtr`.
+ if (!redDecls.empty() || !inRedDecls.empty()) {
+ auto iface =
+ cast<omp::BlockArgOpenMPOpInterface>(contextOp.getOperation());
+ llvm::OpenMPIRBuilder &ompB = *moduleTranslation.getOpenMPBuilder();
+ llvm::Module *m = moduleTranslation.getLLVMModule();
+ llvm::LLVMContext &llvmCtx = m->getContext();
+ uint32_t srcLocSize;
+ llvm::Constant *srcLocStr = ompB.getOrCreateDefaultSrcLocStr(srcLocSize);
+ llvm::Value *bodyIdent = ompB.getOrCreateIdent(srcLocStr, srcLocSize);
+ llvm::Function *gtidFn = ompB.getOrCreateRuntimeFunctionPtr(
+ llvm::omp::OMPRTL___kmpc_global_thread_num);
+ llvm::Value *bodyGtid =
+ builder.CreateCall(gtidFn, {bodyIdent}, "omp_global_thread_num");
+ llvm::FunctionCallee getThData = ompB.getOrCreateRuntimeFunction(
+ *m, llvm::omp::OMPRTL___kmpc_task_reduction_get_th_data);
+ llvm::Type *ptrTy = llvm::PointerType::getUnqual(llvmCtx);
+
+ ArrayRef<BlockArgument> redBlockArgs = iface.getReductionBlockArgs();
+ for (auto [blockArg, origPtr] :
+ llvm::zip_equal(redBlockArgs, redOrigPtrs)) {
+ llvm::Value *priv = builder.CreateCall(
+ getThData, {bodyGtid, redDesc, origPtr}, "omp.taskred.priv");
+ moduleTranslation.mapValue(blockArg, priv);
+ }
+ ArrayRef<BlockArgument> inRedBlockArgs = iface.getInReductionBlockArgs();
+ llvm::Value *nullDesc = llvm::ConstantPointerNull::get(ptrTy);
+ for (auto [blockArg, origPtr] :
+ llvm::zip_equal(inRedBlockArgs, inRedOrigPtrs)) {
+ llvm::Value *priv = builder.CreateCall(
+ getThData, {bodyGtid, nullDesc, origPtr}, "omp.inred.priv");
+ moduleTranslation.mapValue(blockArg, priv);
+ }
+ }
+
// Lower the contents of the taskloop context region: this is the body of
// the generated task, not the loop.
auto continuationBlockOrError = convertOmpOpRegions(
@@ -3626,12 +3779,12 @@ convertOmpTaskloopContextOp(omp::TaskloopContextOp contextOp,
llvm::omp::Directive::OMPD_taskgroup);
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
+ bool effectiveNoGroup = contextOp.getNogroup() || implicitTaskgroup;
llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
moduleTranslation.getOpenMPBuilder()->createTaskloop(
ompLoc, allocaIP, deallocBlocks, bodyCB, loopInfo, lbVal, ubVal,
- stepVal, contextOp.getUntied(), ifCond, grainsize,
- contextOp.getNogroup(), sched,
- moduleTranslation.lookupValue(contextOp.getFinal()),
+ stepVal, contextOp.getUntied(), ifCond, grainsize, effectiveNoGroup,
+ sched, moduleTranslation.lookupValue(contextOp.getFinal()),
contextOp.getMergeable(),
moduleTranslation.lookupValue(contextOp.getPriority()),
loopOp.getCollapseNumLoops(), taskDupOrNull,
@@ -3643,6 +3796,23 @@ convertOmpTaskloopContextOp(omp::TaskloopContextOp contextOp,
popCancelFinalizationCB(cancelTerminators, ompBuilder, afterIP.get());
builder.restoreIP(*afterIP);
+
+ // Close the implicit taskgroup we opened for task_reduction. The end call
+ // must execute on the encountering thread, so use the outer-scope gtid.
+ if (implicitTaskgroup) {
+ uint32_t srcLocSize;
+ llvm::Constant *srcLocStr =
+ ompBuilder.getOrCreateDefaultSrcLocStr(srcLocSize);
+ llvm::Value *ident = ompBuilder.getOrCreateIdent(srcLocStr, srcLocSize);
+ llvm::Function *gtidFn = ompBuilder.getOrCreateRuntimeFunctionPtr(
+ llvm::omp::OMPRTL___kmpc_global_thread_num);
+ llvm::Value *outerGtid =
+ builder.CreateCall(gtidFn, {ident}, "omp_global_thread_num");
+ llvm::FunctionCallee endTgFn = ompBuilder.getOrCreateRuntimeFunction(
+ *moduleTranslation.getLLVMModule(),
+ llvm::omp::OMPRTL___kmpc_end_taskgroup);
+ builder.CreateCall(endTgFn, {ident, outerGtid});
+ }
return success();
}
@@ -3737,16 +3907,25 @@ emitTaskReductionCombFn(omp::DeclareReductionOp decl, StringRef baseName,
}
/// Emit the per-taskgroup task_reduction descriptor array and the
-/// `__kmpc_taskred_init` runtime call. Must be called with `builder` set to a
-/// point inside the taskgroup body (after `__kmpc_taskgroup`). The descriptor
-/// array itself is allocated at \p allocaIP.
+/// `__kmpc_taskred_init` runtime call. \p origPtrs holds the LLVM values for
+/// the original (shared) variables, one per declaration in \p redDecls.
+/// `builder` must be set to the point at which the descriptor stores and the
+/// init call should be emitted; the descriptor array itself is allocated at
+/// \p allocaIP. \p helperNamePrefix is used to disambiguate the generated
+/// init/combiner helper symbol names between taskgroup and taskloop callers.
+///
+/// Returns the `ptr` value produced by `__kmpc_taskred_init` (the taskgroup
+/// reduction handle), or null on failure.
///
-/// Only the non-byref form is handled here. Byref task_reduction has already
+/// Only the non-byref form is handled here. Byref reductions have already
/// been rejected by `checkImplementationStatus`.
-static LogicalResult emitTaskgroupTaskReductionInit(
- omp::TaskgroupOp tgOp, ArrayRef<omp::DeclareReductionOp> redDecls,
+static llvm::Value *emitTaskReductionInitCall(
+ ArrayRef<omp::DeclareReductionOp> redDecls,
+ ArrayRef<llvm::Value *> origPtrs, StringRef helperNamePrefix,
llvm::IRBuilderBase &builder, llvm::OpenMPIRBuilder::InsertPointTy allocaIP,
LLVM::ModuleTranslation &moduleTranslation) {
+ assert(redDecls.size() == origPtrs.size() &&
+ "expected one orig pointer per reduction decl");
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
llvm::Module *llvmModule = moduleTranslation.getLLVMModule();
llvm::LLVMContext &ctx = llvmModule->getContext();
@@ -3778,23 +3957,22 @@ static LogicalResult emitTaskgroupTaskReductionInit(
builder.CreateAlloca(arrTy, /*ArraySize=*/nullptr, ".taskred.input");
}
- // Fill each descriptor entry inside the taskgroup body.
+ // Fill each descriptor entry at the current builder insertion point.
llvm::Value *zero = builder.getInt32(0);
for (unsigned i = 0; i < n; ++i) {
omp::DeclareReductionOp decl = redDecls[i];
- llvm::Value *orig =
- moduleTranslation.lookupValue(tgOp.getTaskReductionVars()[i]);
+ llvm::Value *orig = origPtrs[i];
llvm::Type *elemTy = moduleTranslation.convertType(decl.getType());
uint64_t size = dl.getTypeAllocSize(elemTy).getFixedValue();
std::string baseName =
- (llvm::Twine("__omp_taskred_") + decl.getSymName()).str();
+ (llvm::Twine(helperNamePrefix) + decl.getSymName()).str();
llvm::Function *initFn =
emitTaskReductionInitFn(decl, baseName, moduleTranslation);
llvm::Function *combFn =
emitTaskReductionCombFn(decl, baseName, moduleTranslation);
if (!initFn || !combFn)
- return failure();
+ return nullptr;
llvm::Value *elemPtr = builder.CreateInBoundsGEP(
arrTy, arrAlloca, {zero, builder.getInt32(i)}, ".taskred.elem");
auto storeField = [&](unsigned fieldIdx, llvm::Value *val) {
@@ -3816,11 +3994,14 @@ static LogicalResult emitTaskgroupTaskReductionInit(
llvm::Constant *srcLocStr =
ompBuilder->getOrCreateDefaultSrcLocStr(srcLocSize);
llvm::Value *ident = ompBuilder->getOrCreateIdent(srcLocStr, srcLocSize);
- llvm::Value *gtid = ompBuilder->getOrCreateThreadID(ident);
+ llvm::Function *gtidFn = ompBuilder->getOrCreateRuntimeFunctionPtr(
+ llvm::omp::OMPRTL___kmpc_global_thread_num);
+ llvm::Value *gtid =
+ builder.CreateCall(gtidFn, {ident}, "omp_global_thread_num");
llvm::FunctionCallee taskredInit = ompBuilder->getOrCreateRuntimeFunction(
*llvmModule, llvm::omp::OMPRTL___kmpc_taskred_init);
- builder.CreateCall(taskredInit, {gtid, builder.getInt32(n), arrAlloca});
- return success();
+ return builder.CreateCall(taskredInit, {gtid, builder.getInt32(n), arrAlloca},
+ ".taskred.desc");
}
/// Converts an OpenMP taskgroup construct into LLVM IR using OpenMPIRBuilder.
@@ -3864,8 +4045,12 @@ convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
builder.restoreIP(codegenIP);
if (!redDecls.empty()) {
- if (failed(emitTaskgroupTaskReductionInit(tgOp, redDecls, builder,
- allocaIP, moduleTranslation)))
+ SmallVector<llvm::Value *> origPtrs;
+ origPtrs.reserve(redDecls.size());
+ for (Value v : tgOp.getTaskReductionVars())
+ origPtrs.push_back(moduleTranslation.lookupValue(v));
+ if (!emitTaskReductionInitCall(redDecls, origPtrs, "__omp_taskred_",
+ builder, allocaIP, moduleTranslation))
return llvm::createStringError(
llvm::inconvertibleErrorCode(),
"failed to emit task_reduction initialization for omp.taskgroup");
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop-reduction.mlir
new file mode 100644
index 0000000000000..0043f75bfe227
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop-reduction.mlir
@@ -0,0 +1,245 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+// Single scalar reduction on omp.taskloop.context. The lowering must:
+// 1. Emit an implicit __kmpc_taskgroup in the encountering function (since
+// the user did not write nogroup);
+// 2. Build a kmp_taskred_input_t descriptor array and call
+// __kmpc_taskred_init, capturing the returned descriptor handle;
+// 3. Force nogroup=1 on the inner __kmpc_taskloop call so that the
+// OpenMPIRBuilder does not emit a second taskgroup;
+// 4. Inside the outlined task body, call __kmpc_global_thread_num to obtain
+// the executing thread's gtid, then look up the per-task private storage
+// via __kmpc_task_reduction_get_th_data(gtid, redDesc, orig);
+// 5. Close the implicit taskgroup with __kmpc_end_taskgroup.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+llvm.func @taskloop_reduction_single(%x : !llvm.ptr, %lb : i32, %ub : i32, %step : i32) {
+ omp.taskloop.context reduction(@add_i32 %x -> %prv : !llvm.ptr) {
+ omp.taskloop.wrapper {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ %v = llvm.load %prv : !llvm.ptr -> i32
+ %s = llvm.add %v, %iv : i32
+ llvm.store %s, %prv : i32, !llvm.ptr
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK: %kmp_taskred_input_t = type { ptr, ptr, i64, ptr, ptr, ptr, i32 }
+
+// Encountering function emits taskgroup + descriptor + taskred_init.
+// CHECK-LABEL: define void @taskloop_reduction_single(
+// CHECK-SAME: ptr %[[X:[^,]+]],
+// CHECK: %[[ARR:.+]] = alloca [1 x %kmp_taskred_input_t]
+// CHECK: call void @__kmpc_taskgroup(
+// CHECK: %[[ELEM:.+]] = getelementptr inbounds [1 x %kmp_taskred_input_t], ptr %[[ARR]], i32 0, i32 0
+// CHECK: %[[SHAR:.+]] = getelementptr {{.+}} %kmp_taskred_input_t, ptr %[[ELEM]], i32 0, i32 0
+// CHECK: store ptr %[[X]], ptr %[[SHAR]]
+// CHECK: store ptr @__omp_taskloop_taskred_add_i32.red.init
+// CHECK: store ptr @__omp_taskloop_taskred_add_i32.red.comb
+// CHECK: %[[DESC:.+]] = call ptr @__kmpc_taskred_init(i32 %{{.+}}, i32 1, ptr %[[ARR]])
+// The returned descriptor is stored into the structArg captured by
+// __kmpc_omp_task_alloc so the outlined task body can load it back.
+// CHECK: store ptr %[[DESC]], ptr %{{.+}}
+// __kmpc_taskloop must be called with nogroup=1 because we already opened
+// our own taskgroup above.
+// CHECK: call void @__kmpc_taskloop(ptr {{.+}}, i32 {{.+}}, ptr {{.+}}, i32 1,
+// CHECK: call void @__kmpc_end_taskgroup(
+
+// Outlined task body looks up per-task storage via the runtime, passing the
+// reloaded descriptor (not null) as the second argument.
+// CHECK-LABEL: define internal void @taskloop_reduction_single..omp_par(
+// CHECK: %[[BODY_DESC:.+]] = load ptr, ptr %gep_.taskred.desc
+// CHECK: %[[BODY_ORIG:.+]] = load ptr, ptr %gep_,
+// CHECK: %[[BODY_GTID:.+]] = call i32 @__kmpc_global_thread_num(
+// CHECK: %[[PRIV:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %[[BODY_GTID]], ptr %[[BODY_DESC]], ptr %[[BODY_ORIG]])
+// CHECK: load i32, ptr %[[PRIV]]
+// CHECK: store i32 %{{.+}}, ptr %[[PRIV]]
+
+// -----
+
+// Multiple reductions: each entry in the descriptor array gets distinct
+// init / combiner helpers and the body issues one
+// __kmpc_task_reduction_get_th_data per reduction.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+omp.declare_reduction @mul_i64 : i64
+init {
+^bb0(%arg0: i64):
+ %c1 = llvm.mlir.constant(1 : i64) : i64
+ omp.yield(%c1 : i64)
+}
+combiner {
+^bb0(%arg0: i64, %arg1: i64):
+ %p = llvm.mul %arg0, %arg1 : i64
+ omp.yield(%p : i64)
+}
+
+llvm.func @taskloop_reduction_multi(%x : !llvm.ptr, %y : !llvm.ptr, %lb : i32, %ub : i32, %step : i32) {
+ omp.taskloop.context reduction(@add_i32 %x -> %a, @mul_i64 %y -> %b : !llvm.ptr, !llvm.ptr) {
+ omp.taskloop.wrapper {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ %va = llvm.load %a : !llvm.ptr -> i32
+ %vai = llvm.add %va, %iv : i32
+ llvm.store %vai, %a : i32, !llvm.ptr
+ %vb = llvm.load %b : !llvm.ptr -> i64
+ %iv64 = llvm.sext %iv : i32 to i64
+ %vbi = llvm.mul %vb, %iv64 : i64
+ llvm.store %vbi, %b : i64, !llvm.ptr
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define void @taskloop_reduction_multi(
+// CHECK: %[[ARR2:.+]] = alloca [2 x %kmp_taskred_input_t]
+// CHECK: call void @__kmpc_taskgroup(
+// CHECK: store i64 4
+// CHECK: store ptr @__omp_taskloop_taskred_add_i32.red.init
+// CHECK: store ptr @__omp_taskloop_taskred_add_i32.red.comb
+// CHECK: store i64 8
+// CHECK: store ptr @__omp_taskloop_taskred_mul_i64.red.init
+// CHECK: store ptr @__omp_taskloop_taskred_mul_i64.red.comb
+// CHECK: %[[DESC2:.+]] = call ptr @__kmpc_taskred_init(i32 %{{.+}}, i32 2, ptr %[[ARR2]])
+// The descriptor is captured into structArg so the outlined task can reload it.
+// CHECK: store ptr %[[DESC2]], ptr %{{.+}}
+// CHECK: call void @__kmpc_taskloop(ptr {{.+}}, i32 {{.+}}, ptr {{.+}}, i32 1,
+// CHECK: call void @__kmpc_end_taskgroup(
+
+// CHECK-LABEL: define internal void @taskloop_reduction_multi..omp_par(
+// CHECK: %[[BODY_GTID2:.+]] = call i32 @__kmpc_global_thread_num(
+// Both get_th_data calls share the same body gtid; the descriptor argument
+// must be a reloaded SSA value (not null).
+// CHECK: call ptr @__kmpc_task_reduction_get_th_data(i32 %[[BODY_GTID2]], ptr %{{[^,]+}}, ptr %{{.+}})
+// CHECK: call ptr @__kmpc_task_reduction_get_th_data(i32 %[[BODY_GTID2]], ptr %{{[^,]+}}, ptr %{{.+}})
+
+// -----
+
+// in_reduction on omp.taskloop.context nested inside an outer taskgroup
+// task_reduction. No new __kmpc_taskgroup must be emitted for the taskloop
+// itself (the user did not write reduction on it), and the get_th_data call
+// must pass a NULL descriptor so the runtime walks up to the enclosing
+// taskgroup.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+llvm.func @taskloop_inreduction(%x : !llvm.ptr, %lb : i32, %ub : i32, %step : i32) {
+ omp.taskgroup task_reduction(@add_i32 %x -> %tg : !llvm.ptr) {
+ omp.taskloop.context in_reduction(@add_i32 %x -> %prv : !llvm.ptr) {
+ omp.taskloop.wrapper {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ %v = llvm.load %prv : !llvm.ptr -> i32
+ %s = llvm.add %v, %iv : i32
+ llvm.store %s, %prv : i32, !llvm.ptr
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define void @taskloop_inreduction(
+// Outer taskgroup opens once; we expect only ONE __kmpc_taskgroup for the
+// outer construct (the taskloop itself must not open a second one).
+// CHECK: call void @__kmpc_taskgroup(
+// CHECK-NOT: call void @__kmpc_taskgroup(
+// The outer descriptor is built; the taskloop must NOT build its own
+// taskred_init.
+// CHECK: call ptr @__kmpc_taskred_init(
+// CHECK-NOT: call ptr @__kmpc_taskred_init(
+// CHECK: call void @__kmpc_taskloop(
+// CHECK: call void @__kmpc_end_taskgroup(
+
+// In the outlined taskloop task body, the in_reduction lookup passes NULL
+// as the descriptor argument so the runtime walks up enclosing taskgroups.
+// CHECK-LABEL: define internal void @taskloop_inreduction..omp_par(
+// CHECK: call i32 @__kmpc_global_thread_num(
+// CHECK: call ptr @__kmpc_task_reduction_get_th_data(i32 %{{.+}}, ptr null, ptr %{{.+}})
+
+// -----
+
+// nogroup + in_reduction: the user wrote `nogroup` on the taskloop and only an
+// in_reduction clause, so the translator must NOT open an implicit taskgroup
+// and must NOT build a taskred descriptor for the taskloop itself; `nogroup`
+// must be propagated to __kmpc_taskloop as 1, and the outlined body must look
+// up the participant with a NULL descriptor so the runtime walks up.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+llvm.func @taskloop_nogroup_inreduction(%x : !llvm.ptr, %lb : i32, %ub : i32, %step : i32) {
+ omp.taskloop.context nogroup in_reduction(@add_i32 %x -> %prv : !llvm.ptr) {
+ omp.taskloop.wrapper {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ %v = llvm.load %prv : !llvm.ptr -> i32
+ %s = llvm.add %v, %iv : i32
+ llvm.store %s, %prv : i32, !llvm.ptr
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// Outer caller: no implicit taskgroup, no taskred_init, nogroup=1 to taskloop.
+// CHECK-LABEL: define void @taskloop_nogroup_inreduction(
+// CHECK-NOT: call void @__kmpc_taskgroup(
+// CHECK-NOT: call ptr @__kmpc_taskred_init(
+// CHECK-NOT: call void @__kmpc_end_taskgroup(
+// CHECK: call void @__kmpc_taskloop(ptr {{[^,]+}}, i32 {{[^,]+}}, ptr {{[^,]+}}, i32 1,
+
+// In the outlined task body, the in_reduction lookup uses a NULL descriptor.
+// CHECK-LABEL: define internal void @taskloop_nogroup_inreduction..omp_par(
+// CHECK: call i32 @__kmpc_global_thread_num(
+// CHECK: call ptr @__kmpc_task_reduction_get_th_data(i32 %{{.+}}, ptr null, ptr %{{.+}})
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index d5d0a96779db0..5c22f7f081bb5 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -378,20 +378,20 @@ llvm.func @taskloop_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr)
}
// -----
- omp.declare_reduction @add_reduction_i32 : i32 init {
+omp.declare_reduction @add_reduction_i32 : i32 init {
^bb0(%arg0: i32):
%0 = llvm.mlir.constant(0 : i32) : i32
omp.yield(%0 : i32)
- }combiner {
+ } combiner {
^bb0(%arg0: i32, %arg1: i32):
%0 = llvm.add %arg0, %arg1 : i32
omp.yield(%0 : i32)
}
-llvm.func @taskloop_inreduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+llvm.func @taskloop_inreduction_byref(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error@below {{not yet implemented: Unhandled clause in_reduction with byref modifier in omp.taskloop.context operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.taskloop.context}}
- // expected-error@below {{not yet implemented: Unhandled clause in_reduction in omp.taskloop.context operation}}
- omp.taskloop.context in_reduction(@add_reduction_i32 %x -> %arg0 : !llvm.ptr) {
+ omp.taskloop.context in_reduction(byref @add_reduction_i32 %x -> %arg0 : !llvm.ptr) {
omp.taskloop.wrapper {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
omp.yield
@@ -403,20 +403,102 @@ llvm.func @taskloop_inreduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.pt
}
// -----
- omp.declare_reduction @add_reduction_i32 : i32 init {
+omp.declare_reduction @add_reduction_i32 : i32 init {
^bb0(%arg0: i32):
%0 = llvm.mlir.constant(0 : i32) : i32
omp.yield(%0 : i32)
- }combiner {
+ } combiner {
^bb0(%arg0: i32, %arg1: i32):
%0 = llvm.add %arg0, %arg1 : i32
omp.yield(%0 : i32)
}
-llvm.func @taskloop_reduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+llvm.func @taskloop_reduction_byref(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error@below {{not yet implemented: Unhandled clause reduction with byref modifier in omp.taskloop.context operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.taskloop.context}}
- // expected-error@below {{not yet implemented: Unhandled clause reduction in omp.taskloop.context operation}}
- omp.taskloop.context reduction(@add_reduction_i32 %x -> %arg0 : !llvm.ptr) {
+ omp.taskloop.context reduction(byref @add_reduction_i32 %x -> %arg0 : !llvm.ptr) {
+ omp.taskloop.wrapper {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// -----
+omp.declare_reduction @add_reduction_cleanup_i32 : i32 init {
+ ^bb0(%arg0: i32):
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%0 : i32)
+ } combiner {
+ ^bb0(%arg0: i32, %arg1: i32):
+ %0 = llvm.add %arg0, %arg1 : i32
+ omp.yield(%0 : i32)
+ } cleanup {
+ ^bb0(%arg0: i32):
+ omp.yield
+ }
+
+llvm.func @taskloop_reduction_cleanup(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error@below {{not yet implemented: reduction with cleanup region in omp.taskloop.context}}
+ // expected-error@below {{LLVM Translation failed for operation: omp.taskloop.context}}
+ omp.taskloop.context reduction(@add_reduction_cleanup_i32 %x -> %arg0 : !llvm.ptr) {
+ omp.taskloop.wrapper {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// -----
+
+omp.declare_reduction @add_reduction_modifier_i32 : i32 init {
+ ^bb0(%arg0: i32):
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%0 : i32)
+ } combiner {
+ ^bb0(%arg0: i32, %arg1: i32):
+ %0 = llvm.add %arg0, %arg1 : i32
+ omp.yield(%0 : i32)
+ }
+
+llvm.func @taskloop_reduction_modifier(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error@below {{not yet implemented: Unhandled clause reduction with modifier in omp.taskloop.context operation}}
+ // expected-error@below {{LLVM Translation failed for operation: omp.taskloop.context}}
+ omp.taskloop.context reduction(mod:inscan, @add_reduction_modifier_i32 %x -> %arg0 : !llvm.ptr) {
+ omp.taskloop.wrapper {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// -----
+
+omp.declare_reduction @add_reduction_two_arg_init_i32 : !llvm.ptr alloc {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr
+ omp.yield(%1 : !llvm.ptr)
+} init {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ omp.yield(%arg1 : !llvm.ptr)
+} combiner {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ omp.yield(%arg0 : !llvm.ptr)
+}
+
+llvm.func @taskloop_reduction_two_arg_init(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error@below {{not yet implemented: reduction with two-argument initializer in omp.taskloop.context}}
+ // expected-error@below {{LLVM Translation failed for operation: omp.taskloop.context}}
+ omp.taskloop.context reduction(@add_reduction_two_arg_init_i32 %x -> %arg0 : !llvm.ptr) {
omp.taskloop.wrapper {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
omp.yield
>From 960e149fc2386c531a5b53c5c0798de4c7d66d7f Mon Sep 17 00:00:00 2001
From: Sairudra More <sairudra60 at gmail.com>
Date: Wed, 27 May 2026 06:56:02 -0500
Subject: [PATCH 3/3] [flang][OpenMP] Lower target in_reduction for host
fallback
Teach Flang lowering and MLIR OpenMP translation to carry
in_reduction through omp.target for the host-fallback path.
The translation looks up task reduction-private storage with
__kmpc_task_reduction_get_th_data and binds the target region's
in_reduction block argument to that private pointer, so uses inside the
region do not keep referring to the original variable.
The patch also preserves in_reduction operands in the TargetOp builder
path and ensures target in_reduction list items are mapped into the
target region when needed.
The device/offload-entry path remains diagnosed as not yet implemented.
---
flang/lib/Lower/OpenMP/OpenMP.cpp | 70 ++++++++++++--
.../Lower/OpenMP/Todo/target-inreduction.f90 | 15 ---
.../OpenMP/target-inreduction-unused.f90 | 27 ++++++
.../test/Lower/OpenMP/target-inreduction.f90 | 28 ++++++
mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 15 ++-
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 91 ++++++++++++++++++-
mlir/test/Dialect/OpenMP/invalid.mlir | 60 ++++++++++++
.../LLVMIR/openmp-target-in-reduction.mlir | 50 ++++++++++
mlir/test/Target/LLVMIR/openmp-todo.mlir | 86 +++++++++++++++++-
9 files changed, 412 insertions(+), 30 deletions(-)
delete mode 100644 flang/test/Lower/OpenMP/Todo/target-inreduction.f90
create mode 100644 flang/test/Lower/OpenMP/target-inreduction-unused.f90
create mode 100644 flang/test/Lower/OpenMP/target-inreduction.f90
create mode 100644 mlir/test/Target/LLVMIR/openmp-target-in-reduction.mlir
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 7cb7e379eb503..981458015408b 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -433,13 +433,16 @@ static void bindEntryBlockArgs(lower::AbstractConverter &converter,
.first);
};
- // Process in clause name alphabetical order to match block arguments order.
// Do not bind host_eval variables because they cannot be used inside of the
// corresponding region, except for very specific cases handled separately.
+ // Bind map before in_reduction so that for target in_reduction list items
+ // (which are also implicitly mapped), the in_reduction binding wins and
+ // in-body references use the reduction-private block argument, not the
+ // mapped/original address.
bindMapLike(args.hasDeviceAddr.objects, op.getHasDeviceAddrBlockArgs());
+ bindMapLike(args.map.objects, op.getMapBlockArgs());
bindPrivateLike(args.inReduction.objects, args.inReduction.vars,
op.getInReductionBlockArgs());
- bindMapLike(args.map.objects, op.getMapBlockArgs());
bindPrivateLike(args.priv.objects, args.priv.vars, op.getPrivateBlockArgs());
bindPrivateLike(args.reduction.objects, args.reduction.vars,
op.getReductionBlockArgs());
@@ -1873,6 +1876,7 @@ genTargetClauses(lower::AbstractConverter &converter,
mlir::omp::TargetOperands &clauseOps,
DefaultMapsTy &defaultMaps,
llvm::SmallVectorImpl<Object> &hasDeviceAddrObjects,
+ llvm::SmallVectorImpl<Object> &inReductionObjects,
llvm::SmallVectorImpl<Object> &isDevicePtrObjects,
llvm::SmallVectorImpl<Object> &mapObjects) {
ClauseProcessor cp(converter, semaCtx, clauses);
@@ -1887,13 +1891,14 @@ genTargetClauses(lower::AbstractConverter &converter,
hostEvalInfo->collectValues(clauseOps.hostEvalVars);
}
cp.processIf(llvm::omp::Directive::OMPD_target, clauseOps);
+ cp.processInReduction(loc, clauseOps, inReductionObjects);
cp.processIsDevicePtr(stmtCtx, clauseOps, isDevicePtrObjects);
cp.processMap(loc, stmtCtx, clauseOps, llvm::omp::Directive::OMPD_unknown,
&mapObjects);
cp.processNowait(clauseOps);
cp.processThreadLimit(stmtCtx, clauseOps);
- cp.processTODO<clause::Allocate, clause::InReduction, clause::UsesAllocators>(
+ cp.processTODO<clause::Allocate, clause::UsesAllocators>(
loc, llvm::omp::Directive::OMPD_target);
// `target private(..)` is only supported in delayed privatization mode.
@@ -2932,10 +2937,10 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
mlir::omp::TargetOperands clauseOps;
DefaultMapsTy defaultMaps;
llvm::SmallVector<Object> mapObjects, hasDeviceAddrObjects,
- isDevicePtrObjects;
+ inReductionObjects, isDevicePtrObjects;
genTargetClauses(converter, semaCtx, symTable, stmtCtx, eval, item->clauses,
loc, clauseOps, defaultMaps, hasDeviceAddrObjects,
- isDevicePtrObjects, mapObjects);
+ inReductionObjects, isDevicePtrObjects, mapObjects);
if (!isDevicePtrObjects.empty()) {
// is_device_ptr maps get duplicated so the clause and synthesized
@@ -3108,6 +3113,58 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
Object{const_cast<semantics::Symbol *>(&sym), std::nullopt});
}
};
+ // OpenMP requires `in_reduction` list items on `target` to be implicitly
+ // data-mapped. The MLIR -> LLVM IR translation passes the mapped pointer
+ // as the `orig` argument of `__kmpc_task_reduction_get_th_data`, so the
+ // map must be address-preserving regardless of the scalar default capture
+ // (which would otherwise be ByCopy for small scalars and break the
+ // runtime lookup against the enclosing taskgroup's task_reduction
+ // descriptor). Emit these maps before the generic implicit-map walk so
+ // that walk treats the symbols as already mapped via
+ // `isDuplicateMappedSymbol` and does not downgrade them to ByCopy.
+ auto captureInReductionImplicitMap = [&](const semantics::Symbol &sym) {
+ if (sym.owner().IsDerivedType())
+ return;
+ if (!converter.getSymbolAddress(sym))
+ return;
+ if (isDuplicateMappedSymbol(sym, dsp.getAllSymbolsToPrivatize(),
+ hasDeviceAddrObjects, mapObjects,
+ isDevicePtrObjects))
+ return;
+ if (const auto *details =
+ sym.template detailsIf<semantics::HostAssocDetails>())
+ converter.copySymbolBinding(details->symbol(), sym);
+ std::stringstream name;
+ fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(sym);
+ name << sym.name().ToString();
+ fir::factory::AddrAndBoundsInfo info =
+ Fortran::lower::getDataOperandBaseAddr(converter, firOpBuilder,
+ sym.GetUltimate(),
+ converter.getCurrentLocation());
+ llvm::SmallVector<mlir::Value> bounds =
+ fir::factory::genImplicitBoundsOps<mlir::omp::MapBoundsOp,
+ mlir::omp::MapBoundsType>(
+ firOpBuilder, info, dataExv,
+ semantics::IsAssumedSizeArray(sym.GetUltimate()),
+ converter.getCurrentLocation());
+ mlir::Value baseOp = info.rawInput;
+ mlir::omp::ClauseMapFlags flags = mlir::omp::ClauseMapFlags::implicit |
+ mlir::omp::ClauseMapFlags::to |
+ mlir::omp::ClauseMapFlags::from;
+ mlir::Value mapOp = createMapInfoOp(
+ firOpBuilder, converter.getCurrentLocation(), baseOp,
+ /*varPtrPtr=*/mlir::Value{}, name.str(), bounds, /*members=*/{},
+ /*membersIndex=*/mlir::ArrayAttr{}, flags,
+ mlir::omp::VariableCaptureKind::ByRef, baseOp.getType(),
+ /*partialMap=*/false, /*mapperId=*/mlir::FlatSymbolRefAttr{});
+ clauseOps.mapVars.push_back(mapOp);
+ mapObjects.push_back(
+ Object{const_cast<semantics::Symbol *>(&sym), std::nullopt});
+ };
+ for (const Object &object : inReductionObjects)
+ if (const semantics::Symbol *sym = object.sym())
+ captureInReductionImplicitMap(*sym);
+
lower::pft::visitAllSymbols(eval, captureImplicitMap);
auto targetOp = mlir::omp::TargetOp::create(firOpBuilder, loc, clauseOps);
@@ -3120,7 +3177,8 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
args.hasDeviceAddr.objects = hasDeviceAddrObjects;
args.hasDeviceAddr.vars = hasDeviceAddrBaseValues;
args.hostEvalVars = clauseOps.hostEvalVars;
- // TODO: Add in_reduction syms and vars.
+ args.inReduction.objects = inReductionObjects;
+ args.inReduction.vars = clauseOps.inReductionVars;
args.map.objects = mapObjects;
args.map.vars = mapBaseValues;
args.priv.objects = makeObjects(dsp.getDelayedPrivSymbols());
diff --git a/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/target-inreduction.f90
deleted file mode 100644
index e5a9cffac5a11..0000000000000
--- a/flang/test/Lower/OpenMP/Todo/target-inreduction.f90
+++ /dev/null
@@ -1,15 +0,0 @@
-! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-
-!===============================================================================
-! `mergeable` clause
-!===============================================================================
-
-! CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TARGET construct
-subroutine omp_target_inreduction()
- integer i
- i = 0
- !$omp target in_reduction(+:i)
- i = i + 1
- !$omp end target
-end subroutine omp_target_inreduction
diff --git a/flang/test/Lower/OpenMP/target-inreduction-unused.f90 b/flang/test/Lower/OpenMP/target-inreduction-unused.f90
new file mode 100644
index 0000000000000..cf0d39db3e9a7
--- /dev/null
+++ b/flang/test/Lower/OpenMP/target-inreduction-unused.f90
@@ -0,0 +1,27 @@
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+
+! Per the OpenMP spec, an in_reduction list item on a target construct is
+! implicitly data-mapped. The lowering must not rely on the variable being
+! referenced inside the target body to discover that map: here `i` only
+! appears in the in_reduction clause and is never read or written inside
+! the region. Verify that an omp.map.info for `i` is still emitted and
+! flows into the omp.target's map_entries.
+
+!CHECK-LABEL: func.func @_QPomp_target_in_reduction_unused()
+!CHECK: %[[IDECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFomp_target_in_reduction_unusedEi"}
+!CHECK: %[[IMAP:.*]] = omp.map.info var_ptr(%[[IDECL]]#1 : !fir.ref<i32>, i32) map_clauses(implicit, tofrom) capture(ByRef) -> !fir.ref<i32> {name = "i"}
+!CHECK: omp.target in_reduction(@{{[^ ]+}} %[[IDECL]]#0 -> %{{[^ ]+}} : !fir.ref<i32>)
+!CHECK-SAME: map_entries(%[[IMAP]] -> %{{[^ ]+}} : !fir.ref<i32>)
+
+subroutine omp_target_in_reduction_unused()
+ interface
+ subroutine sub()
+ end subroutine
+ end interface
+ integer i
+ i = 0
+ !$omp target in_reduction(+:i)
+ call sub()
+ !$omp end target
+end subroutine omp_target_in_reduction_unused
diff --git a/flang/test/Lower/OpenMP/target-inreduction.f90 b/flang/test/Lower/OpenMP/target-inreduction.f90
new file mode 100644
index 0000000000000..3955cacb744c2
--- /dev/null
+++ b/flang/test/Lower/OpenMP/target-inreduction.f90
@@ -0,0 +1,28 @@
+! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+
+! Verify that in_reduction on a target construct is lowered to an
+! omp.target with both an in_reduction clause and an implicit map_entries
+! entry for the same variable. The implicit map captures the original
+! pointer into the target region so the MLIR -> LLVM IR translation can
+! pass it to __kmpc_task_reduction_get_th_data.
+
+!CHECK-LABEL: omp.declare_reduction
+!CHECK-SAME: @[[RED_I32_NAME:.*]] : i32 init {
+
+!CHECK-LABEL: func.func @_QPomp_target_in_reduction()
+!CHECK: %[[IDECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFomp_target_in_reductionEi"}
+!CHECK: %[[IMAP:.*]] = omp.map.info var_ptr(%[[IDECL]]#1 : !fir.ref<i32>, i32) map_clauses(implicit, tofrom) capture(ByRef) -> !fir.ref<i32> {name = "i"}
+!CHECK: omp.target in_reduction(@[[RED_I32_NAME]] %[[IDECL]]#0 -> %[[INARG:[^ ]+]] : !fir.ref<i32>)
+!CHECK-SAME: map_entries(%[[IMAP]] -> %{{[^ ]+}} : !fir.ref<i32>)
+!CHECK: hlfir.declare %[[INARG]]
+!CHECK: omp.terminator
+!CHECK: }
+
+subroutine omp_target_in_reduction()
+ integer i
+ i = 0
+ !$omp target in_reduction(+:i)
+ i = i + 1
+ !$omp end target
+end subroutine omp_target_in_reduction
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 7cef23bdfef18..9daef3368ec4c 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -2545,8 +2545,7 @@ LogicalResult TargetUpdateOp::verify() {
void TargetOp::build(OpBuilder &builder, OperationState &state,
const TargetOperands &clauses) {
MLIRContext *ctx = builder.getContext();
- // TODO Store clauses in op: allocateVars, allocatorVars, inReductionVars,
- // inReductionByref, inReductionSyms.
+ // TODO Store clauses in op: allocateVars, allocatorVars.
TargetOp::build(
builder, state, /*allocate_vars=*/{}, /*allocator_vars=*/{}, clauses.bare,
makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars,
@@ -2554,9 +2553,10 @@ void TargetOp::build(OpBuilder &builder, OperationState &state,
clauses.device, clauses.dynGroupprivateAccessGroup,
clauses.dynGroupprivateFallback, clauses.dynGroupprivateSize,
clauses.hasDeviceAddrVars, clauses.hostEvalVars, clauses.ifExpr,
- /*in_reduction_vars=*/{}, /*in_reduction_byref=*/nullptr,
- /*in_reduction_syms=*/nullptr, clauses.isDevicePtrVars, clauses.mapVars,
- clauses.nowait, clauses.privateVars,
+ clauses.inReductionVars,
+ makeDenseBoolArrayAttr(ctx, clauses.inReductionByref),
+ makeArrayAttr(ctx, clauses.inReductionSyms), clauses.isDevicePtrVars,
+ clauses.mapVars, clauses.nowait, clauses.privateVars,
makeArrayAttr(ctx, clauses.privateSyms), clauses.privateNeedsBarrier,
clauses.threadLimitVars,
/*private_maps=*/nullptr);
@@ -2583,6 +2583,11 @@ LogicalResult TargetOp::verify() {
if (failed(verifyPrivateVarList(*this)))
return failure();
+ if (failed(verifyReductionVarList(*this, getInReductionSyms(),
+ getInReductionVars(),
+ getInReductionByref())))
+ return failure();
+
return verifyPrivateVarsMapping(*this);
}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 1120d9fc38d0a..2ef23a80577d8 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -490,7 +490,11 @@ static LogicalResult checkImplementationStatus(Operation &op) {
.Case([&](omp::TargetOp op) {
checkAllocate(op, result);
checkBare(op, result);
- checkInReduction(op, result);
+ // in_reduction(byref(...)) on target is not implemented yet. Other
+ // unsupported in_reduction shapes (cleanup region, two-argument
+ // initializer, missing matching map entry) and the device-side /
+ // offload-entry cases are diagnosed inline in convertOmpTarget.
+ checkInReductionByref(op, result);
checkThreadLimit(op, result);
})
.Default([](Operation &) {
@@ -8208,6 +8212,61 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
bool isOffloadEntry =
isTargetDevice || !ompBuilder->Config.TargetTriples.empty();
+ // Validate and resolve in_reduction clauses on omp.target. We currently
+ // only support the non-offload host-fallback path: the per-task private
+ // pointer is obtained by calling __kmpc_task_reduction_get_th_data inside
+ // the to-be-outlined target task body. Threading that pointer through the
+ // device kernel argument list is left as follow-up work.
+ SmallVector<llvm::Value *> inRedOrigPtrs;
+ if (!targetOp.getInReductionVars().empty()) {
+ if (isTargetDevice || isOffloadEntry)
+ return opInst.emitError(
+ "not yet implemented: in_reduction clause on omp.target with "
+ "offload / target-device compilation");
+ if (auto inRedSyms = targetOp.getInReductionSyms()) {
+ for (auto sym : inRedSyms->getAsRange<SymbolRefAttr>()) {
+ auto decl =
+ SymbolTable::lookupNearestSymbolFrom<omp::DeclareReductionOp>(
+ targetOp, sym);
+ if (!decl)
+ return targetOp.emitError()
+ << "failed to resolve in_reduction declare_reduction symbol "
+ << sym.getRootReference() << " on omp.target";
+ if (decl.getInitializerRegion().front().getNumArguments() != 1)
+ return targetOp.emitError()
+ << "not yet implemented: in_reduction with two-argument "
+ "initializer on omp.target";
+ if (!decl.getCleanupRegion().empty())
+ return targetOp.emitError()
+ << "not yet implemented: in_reduction with cleanup region "
+ "on omp.target";
+ // The reduction combiner region is intentionally not required here:
+ // the in_reduction lowering on omp.target only locates the per-task
+ // private storage via __kmpc_task_reduction_get_th_data. The combiner
+ // is owned by the enclosing taskgroup's task_reduction registration.
+ }
+ }
+ // Each in_reduction variable must also be captured by the target via a
+ // map_entries entry referring to the same outer SSA value. OMPIRBuilder
+ // outlines the target body and only rewires uses of values that enter
+ // the kernel through the map-derived input set. The runtime call below
+ // uses that same outer SSA value as its `orig` argument, so without a
+ // matching map entry the outlined kernel would reference a value defined
+ // in the host function and fail IR verification.
+ llvm::SmallPtrSet<Value, 4> mappedVarPtrs;
+ for (Value mapV : targetOp.getMapVars())
+ if (auto mapInfo = mapV.getDefiningOp<omp::MapInfoOp>())
+ mappedVarPtrs.insert(mapInfo.getVarPtr());
+ inRedOrigPtrs.reserve(targetOp.getInReductionVars().size());
+ for (Value v : targetOp.getInReductionVars()) {
+ if (!mappedVarPtrs.contains(v))
+ return targetOp.emitError()
+ << "not yet implemented: in_reduction variable on omp.target "
+ "must also be captured by a matching map_entries entry";
+ inRedOrigPtrs.push_back(moduleTranslation.lookupValue(v));
+ }
+ }
+
// For some private variables, the MapsForPrivatizedVariablesPass
// creates MapInfoOp instances. Go through the private variables and
// the mapped variables so that during codegeneration we are able
@@ -8320,6 +8379,36 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
targetOp.getPrivateNeedsBarrier(), &mappedPrivateVars)))
return llvm::make_error<PreviouslyReportedError>();
+ // Map in_reduction block arguments to the per-task private storage
+ // returned by __kmpc_task_reduction_get_th_data. The lookup must run
+ // inside the target task body so the gtid corresponds to the executing
+ // thread. The descriptor argument is NULL: the runtime walks enclosing
+ // taskgroups to locate the matching task_reduction registration for
+ // `origPtr`. Mirrors the in_reduction handling on omp.taskloop.context.
+ ArrayRef<BlockArgument> inRedBlockArgs = argIface.getInReductionBlockArgs();
+ if (!inRedBlockArgs.empty()) {
+ llvm::OpenMPIRBuilder &ompB = *moduleTranslation.getOpenMPBuilder();
+ llvm::Module *m = moduleTranslation.getLLVMModule();
+ llvm::LLVMContext &llvmCtx = m->getContext();
+ uint32_t srcLocSize;
+ llvm::Constant *srcLocStr = ompB.getOrCreateDefaultSrcLocStr(srcLocSize);
+ llvm::Value *bodyIdent = ompB.getOrCreateIdent(srcLocStr, srcLocSize);
+ llvm::Function *gtidFn = ompB.getOrCreateRuntimeFunctionPtr(
+ llvm::omp::OMPRTL___kmpc_global_thread_num);
+ llvm::Value *bodyGtid =
+ builder.CreateCall(gtidFn, {bodyIdent}, "omp_global_thread_num");
+ llvm::FunctionCallee getThData = ompB.getOrCreateRuntimeFunction(
+ *m, llvm::omp::OMPRTL___kmpc_task_reduction_get_th_data);
+ llvm::Type *ptrTy = llvm::PointerType::getUnqual(llvmCtx);
+ llvm::Value *nullDesc = llvm::ConstantPointerNull::get(ptrTy);
+ for (auto [blockArg, origPtr] :
+ llvm::zip_equal(inRedBlockArgs, inRedOrigPtrs)) {
+ llvm::Value *priv = builder.CreateCall(
+ getThData, {bodyGtid, nullDesc, origPtr}, "omp.inred.priv");
+ moduleTranslation.mapValue(blockArg, priv);
+ }
+ }
+
LLVM::ModuleTranslation::SaveStack<OpenMPAllocStackFrame> frame(
moduleTranslation, allocaIP, deallocBlocks);
llvm::Expected<llvm::BasicBlock *> exitBlock = convertOmpOpRegions(
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index 06ad3d60ea635..7e6793d23ac7d 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -3129,6 +3129,66 @@ func.func @omp_target_depend(%data_var: memref<i32>) {
// -----
+func.func @omp_target_in_reduction_unresolved(%ptr: !llvm.ptr) {
+ // expected-error @below {{op expected symbol reference @add_f32 to point to a reduction declaration}}
+ omp.target in_reduction(@add_f32 %ptr -> %arg0 : !llvm.ptr) {
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+omp.declare_reduction @add_f32 : f32
+init {
+^bb0(%arg: f32):
+ %0 = arith.constant 0.0 : f32
+ omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+ %1 = arith.addf %arg0, %arg1 : f32
+ omp.yield (%1 : f32)
+}
+
+func.func @omp_target_in_reduction_duplicate(%ptr: !llvm.ptr) {
+ // expected-error @below {{op accumulator variable used more than once}}
+ omp.target in_reduction(@add_f32 %ptr -> %arg0, @add_f32 %ptr -> %arg1 : !llvm.ptr, !llvm.ptr) {
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg: i32):
+ %0 = arith.constant 0 : i32
+ omp.yield (%0 : i32)
+}
+combiner {
+^bb1(%arg0: i32, %arg1: i32):
+ %1 = arith.addi %arg0, %arg1 : i32
+ omp.yield (%1 : i32)
+}
+atomic {
+^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr):
+ %2 = llvm.load %arg3 : !llvm.ptr -> i32
+ llvm.atomicrmw add %arg2, %2 monotonic : !llvm.ptr, i32
+ omp.yield
+}
+
+func.func @omp_target_in_reduction_type_mismatch(%mem: memref<1xf32>) {
+ // expected-error @below {{op expected accumulator ('memref<1xf32>') to be the same type as reduction declaration ('!llvm.ptr')}}
+ omp.target in_reduction(@add_i32 %mem -> %arg0 : memref<1xf32>) {
+ omp.terminator
+ }
+ return
+}
+
+// -----
+
func.func @omp_distribute_schedule(%chunk_size : i32, %lb : i32, %ub : i32, %step : i32) -> () {
// expected-error @below {{op chunk size set without dist_schedule_static being present}}
"omp.distribute"(%chunk_size) <{operandSegmentSizes = array<i32: 0, 0, 1, 0>}> ({
diff --git a/mlir/test/Target/LLVMIR/openmp-target-in-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-target-in-reduction.mlir
new file mode 100644
index 0000000000000..2b3cfd514d82e
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-target-in-reduction.mlir
@@ -0,0 +1,50 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+// in_reduction on omp.target: the in_reduction variable is also captured
+// into the target region as a map entry (the Flang front-end emits this
+// implicit map). Inside the outlined target body the captured pointer is
+// passed to __kmpc_task_reduction_get_th_data with a NULL descriptor;
+// the runtime walks enclosing taskgroups to locate the matching
+// task_reduction registration. The returned pointer is bound to the
+// in_reduction region block argument so subsequent loads/stores inside
+// the region use the private copy.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+llvm.func @target_inreduction(%x : !llvm.ptr) {
+ %m = omp.map.info var_ptr(%x : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr
+ omp.target in_reduction(@add_i32 %x -> %prv : !llvm.ptr) map_entries(%m -> %marg : !llvm.ptr) {
+ %v = llvm.load %prv : !llvm.ptr -> i32
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ %s = llvm.add %v, %c1 : i32
+ llvm.store %s, %prv : i32, !llvm.ptr
+ omp.terminator
+ }
+ llvm.return
+}
+
+// The host stub forwards the captured pointer into the outlined target
+// kernel.
+// CHECK-LABEL: define void @target_inreduction(
+// CHECK: call void @__omp_offloading_{{.*}}_target_inreduction_{{.*}}(ptr %{{.+}}, ptr null)
+
+// In the outlined target body the in_reduction private pointer is
+// obtained from the runtime using the captured original pointer; that
+// pointer is then the base of the load and store inside the region.
+// CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_target_inreduction_
+// CHECK-SAME: (ptr %[[CAPT:.+]], ptr %{{.+}})
+// CHECK: %[[GTID:.+]] = call i32 @__kmpc_global_thread_num(
+// CHECK: %[[PRIV:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %[[GTID]], ptr null, ptr %[[CAPT]])
+// CHECK: %[[LOADED:.+]] = load i32, ptr %[[PRIV]]
+// CHECK: %[[SUM:.+]] = add i32 %[[LOADED]], 1
+// CHECK: store i32 %[[SUM]], ptr %[[PRIV]]
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 5c22f7f081bb5..6c09fc073e227 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -190,10 +190,90 @@ atomic {
llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
omp.yield
}
-llvm.func @target_in_reduction(%x : !llvm.ptr) {
- // expected-error@below {{not yet implemented: Unhandled clause in_reduction in omp.target operation}}
+llvm.func @target_in_reduction_byref(%x : !llvm.ptr) {
+ // expected-error@below {{not yet implemented: Unhandled clause in_reduction with byref modifier in omp.target operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.target}}
- omp.target in_reduction(@add_f32 %x -> %prv : !llvm.ptr) {
+ omp.target in_reduction(byref @add_f32 %x -> %prv : !llvm.ptr) {
+ omp.terminator
+ }
+ llvm.return
+}
+
+// -----
+
+omp.declare_reduction @add_cleanup_f32 : f32
+init {
+^bb0(%arg: f32):
+ %0 = llvm.mlir.constant(0.0 : f32) : f32
+ omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+ %1 = llvm.fadd %arg0, %arg1 : f32
+ omp.yield (%1 : f32)
+}
+cleanup {
+^bb2(%arg2: f32):
+ omp.yield
+}
+llvm.func @target_in_reduction_cleanup(%x : !llvm.ptr) {
+ // expected-error@below {{not yet implemented: in_reduction with cleanup region on omp.target}}
+ // expected-error@below {{LLVM Translation failed for operation: omp.target}}
+ omp.target in_reduction(@add_cleanup_f32 %x -> %prv : !llvm.ptr) {
+ omp.terminator
+ }
+ llvm.return
+}
+
+// -----
+
+omp.declare_reduction @add_two_arg_init_i32 : !llvm.ptr alloc {
+^bb0(%arg: !llvm.ptr):
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x i32 : (i64) -> !llvm.ptr
+ omp.yield(%1 : !llvm.ptr)
+} init {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ llvm.store %0, %arg1 : i32, !llvm.ptr
+ omp.yield(%arg1 : !llvm.ptr)
+} combiner {
+^bb1(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ %0 = llvm.load %arg0 : !llvm.ptr -> i32
+ %1 = llvm.load %arg1 : !llvm.ptr -> i32
+ %2 = llvm.add %0, %1 : i32
+ llvm.store %2, %arg0 : i32, !llvm.ptr
+ omp.yield(%arg0 : !llvm.ptr)
+}
+llvm.func @target_in_reduction_two_arg_init(%x : !llvm.ptr) {
+ // expected-error@below {{not yet implemented: in_reduction with two-argument initializer on omp.target}}
+ // expected-error@below {{LLVM Translation failed for operation: omp.target}}
+ omp.target in_reduction(@add_two_arg_init_i32 %x -> %prv : !llvm.ptr) {
+ omp.terminator
+ }
+ llvm.return
+}
+
+// -----
+
+omp.declare_reduction @add_no_map_f32 : f32
+init {
+^bb0(%arg: f32):
+ %0 = llvm.mlir.constant(0.0 : f32) : f32
+ omp.yield (%0 : f32)
+}
+combiner {
+^bb1(%arg0: f32, %arg1: f32):
+ %1 = llvm.fadd %arg0, %arg1 : f32
+ omp.yield (%1 : f32)
+}
+llvm.func @target_in_reduction_no_map(%x : !llvm.ptr) {
+ // The in_reduction variable %x has no matching map_entries entry. The
+ // outlined target kernel would otherwise reference %x across function
+ // boundaries; the translation must reject this up front.
+ // expected-error@below {{not yet implemented: in_reduction variable on omp.target must also be captured by a matching map_entries entry}}
+ // expected-error@below {{LLVM Translation failed for operation: omp.target}}
+ omp.target in_reduction(@add_no_map_f32 %x -> %prv : !llvm.ptr) {
omp.terminator
}
llvm.return
More information about the llvm-branch-commits
mailing list