[llvm-branch-commits] [mlir] [mlir][OpenMP] Translate explicit task in_reduction (PR #202611)
Sairudra More via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jun 12 04:57:49 PDT 2026
https://github.com/Saieiei updated https://github.com/llvm/llvm-project/pull/202611
>From fab0eab8d7c4162f3f63b2f324bbf501092a5bca Mon Sep 17 00:00:00 2001
From: Sairudra More <sairudra60 at gmail.com>
Date: Tue, 9 Jun 2026 02:44:30 -0500
Subject: [PATCH] [mlir][OpenMP] Translate explicit task in_reduction
Lower in_reduction on explicit omp.task. Inside the outlined task body,
look up the task-private reduction storage with
__kmpc_task_reduction_get_th_data using a null descriptor, matching the
runtime model where the enclosing taskgroup owns the task_reduction
registration.
The in_reduction block arguments are remapped to the returned private
storage so task-body updates target the private reduction copy instead
of the original shared variable.
Byref in_reduction remains guarded by checkImplementationStatus, and
unsupported declare_reduction forms remain rejected.
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 53 +++++++-
.../LLVMIR/openmp-task-in-reduction.mlir | 122 ++++++++++++++++++
mlir/test/Target/LLVMIR/openmp-todo.mlir | 6 +-
3 files changed, 177 insertions(+), 4 deletions(-)
create mode 100644 mlir/test/Target/LLVMIR/openmp-task-in-reduction.mlir
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index f0aa2486d9e9d..03ec98888e6b0 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -441,7 +441,7 @@ static LogicalResult checkImplementationStatus(Operation &op) {
})
.Case([&](omp::TaskOp op) {
checkAllocate(op, result);
- checkInReduction(op, result);
+ checkInReductionByref(op, result);
})
.Case([&](omp::TaskgroupOp op) {
checkAllocate(op, result);
@@ -3069,6 +3069,22 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
if (failed(buildAffinityData(taskOp, builder, moduleTranslation, ad)))
return llvm::failure();
+ // Resolve and validate in_reduction declarations. Byref in_reduction has
+ // already been rejected by checkImplementationStatus; the helper rejects the
+ // remaining richer declare_reduction shapes (two-argument initializer,
+ // cleanup region, missing combiner). This is pure MLIR symbol-table work and
+ // emits no IR. The matching task_reduction descriptor is registered by an
+ // enclosing taskgroup; here we only look the per-task storage up at runtime.
+ SmallVector<omp::DeclareReductionOp> inRedDecls;
+ if (failed(collectAndValidateTaskloopRedDecls(
+ taskOp.getOperation(), taskOp.getInReductionSyms(), "omp.task",
+ "in_reduction", inRedDecls)))
+ return failure();
+ SmallVector<llvm::Value *> inRedOrigPtrs;
+ inRedOrigPtrs.reserve(inRedDecls.size());
+ for (Value v : taskOp.getInReductionVars())
+ inRedOrigPtrs.push_back(moduleTranslation.lookupValue(v));
+
// Set up for call to createTask()
builder.SetInsertPoint(taskStartBlock);
@@ -3138,6 +3154,41 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
moduleTranslation.mapValue(blockArg, llvmPrivateVar);
}
+ // Map in_reduction block arguments to the per-task private storage returned
+ // by __kmpc_task_reduction_get_th_data. The gtid query and the lookup must
+ // both be emitted inside the to-be-outlined task body so that the lookup
+ // uses the *executing* thread's gtid (not the encountering thread's). The
+ // descriptor is NULL: the runtime walks up enclosing taskgroups to find the
+ // matching task_reduction registration for `origPtr`. The original pointers
+ // are auto-captured into the task shareds aggregate by CodeExtractor during
+ // OpenMPIRBuilder::finalize.
+ if (!inRedDecls.empty()) {
+ auto iface = cast<omp::BlockArgOpenMPOpInterface>(taskOp.getOperation());
+ llvm::OpenMPIRBuilder &ompB = *moduleTranslation.getOpenMPBuilder();
+ llvm::Module *m = moduleTranslation.getLLVMModule();
+ llvm::LLVMContext &llvmCtx = m->getContext();
+ llvm::OpenMPIRBuilder::LocationDescription bodyLoc(builder);
+ uint32_t srcLocSize;
+ llvm::Constant *srcLocStr =
+ ompB.getOrCreateSrcLocStr(bodyLoc, srcLocSize);
+ llvm::Value *bodyIdent = ompB.getOrCreateIdent(srcLocStr, srcLocSize);
+ // Align OpenMPIRBuilder's internal IRBuilder with `builder` so the gtid
+ // call lands inside the to-be-outlined task body.
+ ompB.updateToLocation(bodyLoc);
+ llvm::Value *bodyGtid = ompB.getOrCreateThreadID(bodyIdent);
+ llvm::FunctionCallee getThData = ompB.getOrCreateRuntimeFunction(
+ *m, llvm::omp::OMPRTL___kmpc_task_reduction_get_th_data);
+ llvm::Type *ptrTy = llvm::PointerType::getUnqual(llvmCtx);
+ llvm::Value *nullDesc = llvm::ConstantPointerNull::get(ptrTy);
+ ArrayRef<BlockArgument> inRedBlockArgs = iface.getInReductionBlockArgs();
+ for (auto [blockArg, origPtr] :
+ llvm::zip_equal(inRedBlockArgs, inRedOrigPtrs)) {
+ llvm::Value *priv = builder.CreateCall(
+ getThData, {bodyGtid, nullDesc, origPtr}, "omp.inred.priv");
+ moduleTranslation.mapValue(blockArg, priv);
+ }
+ }
+
auto continuationBlockOrError = convertOmpOpRegions(
taskOp.getRegion(), "omp.task.region", builder, moduleTranslation);
if (failed(handleError(continuationBlockOrError, *taskOp)))
diff --git a/mlir/test/Target/LLVMIR/openmp-task-in-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-task-in-reduction.mlir
new file mode 100644
index 0000000000000..ae7d43b0c1d44
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-task-in-reduction.mlir
@@ -0,0 +1,122 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+// in_reduction on an explicit omp.task. Unlike taskgroup task_reduction, the
+// task does not register a reduction; it participates in a reduction declared
+// by an enclosing taskgroup. The lowering must, inside the outlined task body:
+// 1. Obtain the executing thread's gtid via __kmpc_global_thread_num;
+// 2. Look up the per-task private storage via
+// __kmpc_task_reduction_get_th_data(gtid, null, orig) -- the NULL
+// descriptor makes the runtime walk up enclosing taskgroups to find the
+// matching task_reduction registration for `orig`;
+// 3. Use the returned private pointer for all updates in the task body, never
+// the original shared variable.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+llvm.func @task_in_reduction_single(%x : !llvm.ptr) {
+ omp.task in_reduction(@add_i32 %x -> %prv : !llvm.ptr) {
+ %v = llvm.load %prv : !llvm.ptr -> i32
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ %s = llvm.add %v, %c1 : i32
+ llvm.store %s, %prv : i32, !llvm.ptr
+ omp.terminator
+ }
+ llvm.return
+}
+
+// The encountering function must NOT register a reduction: no taskgroup, no
+// descriptor array, and no __kmpc_taskred_init for in_reduction.
+// CHECK-LABEL: define void @task_in_reduction_single(
+// CHECK-NOT: @__kmpc_taskred_init
+// CHECK-NOT: @__kmpc_taskgroup
+
+// Outlined task body looks up per-task storage via the runtime with a NULL
+// descriptor, and updates that private storage (not the original pointer).
+// CHECK-LABEL: define internal void @task_in_reduction_single..omp_par(
+// CHECK: %[[BODY_GEP:.+]] = getelementptr {{.+}}, i32 0, i32 0
+// CHECK: %[[BODY_ORIG:.+]] = load ptr, ptr %[[BODY_GEP]]
+// CHECK: %[[BODY_GTID:.+]] = call i32 @__kmpc_global_thread_num(
+// CHECK: %[[PRIV:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %[[BODY_GTID]], ptr null, ptr %[[BODY_ORIG]])
+// CHECK: %[[LD:.+]] = load i32, ptr %[[PRIV]]
+// CHECK: %[[ADD:.+]] = add i32 %[[LD]], 1
+// CHECK: store i32 %[[ADD]], ptr %[[PRIV]]
+
+// -----
+
+// Multiple in_reduction items: the body issues one
+// __kmpc_task_reduction_get_th_data per item, each with a NULL descriptor.
+
+omp.declare_reduction @add_i32 : i32
+init {
+^bb0(%arg0: i32):
+ %c0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%c0 : i32)
+}
+combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %s = llvm.add %arg0, %arg1 : i32
+ omp.yield(%s : i32)
+}
+
+llvm.func @task_in_reduction_multi(%x : !llvm.ptr, %y : !llvm.ptr) {
+ omp.task in_reduction(@add_i32 %x -> %px, @add_i32 %y -> %py : !llvm.ptr, !llvm.ptr) {
+ %vx = llvm.load %px : !llvm.ptr -> i32
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ %sx = llvm.add %vx, %c1 : i32
+ llvm.store %sx, %px : i32, !llvm.ptr
+ %vy = llvm.load %py : !llvm.ptr -> i32
+ %c2 = llvm.mlir.constant(2 : i32) : i32
+ %sy = llvm.add %vy, %c2 : i32
+ llvm.store %sy, %py : i32, !llvm.ptr
+ omp.terminator
+ }
+ llvm.return
+}
+
+// Each item is threaded through independently: the two original pointers come
+// from distinct slots of the task shareds aggregate, each is passed to its own
+// __kmpc_task_reduction_get_th_data lookup (NULL descriptor), and each item's
+// body load/store targets only the matching private pointer -- never the
+// original shared pointer.
+// CHECK-LABEL: define internal void @task_in_reduction_multi..omp_par(
+// CHECK: %[[GEP0:.+]] = getelementptr {{.+}}, i32 0, i32 0
+// CHECK: %[[ORIG0:.+]] = load ptr, ptr %[[GEP0]]
+// CHECK: %[[GEP1:.+]] = getelementptr {{.+}}, i32 0, i32 1
+// CHECK: %[[ORIG1:.+]] = load ptr, ptr %[[GEP1]]
+// CHECK: %[[PRIV0:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %{{.+}}, ptr null, ptr %[[ORIG0]])
+// CHECK: %[[PRIV1:.+]] = call ptr @__kmpc_task_reduction_get_th_data(i32 %{{.+}}, ptr null, ptr %[[ORIG1]])
+// CHECK: %[[LDX:.+]] = load i32, ptr %[[PRIV0]]
+// CHECK: %[[ADDX:.+]] = add i32 %[[LDX]], 1
+// CHECK: store i32 %[[ADDX]], ptr %[[PRIV0]]
+// CHECK: %[[LDY:.+]] = load i32, ptr %[[PRIV1]]
+// CHECK: %[[ADDY:.+]] = add i32 %[[LDY]], 2
+// CHECK: store i32 %[[ADDY]], ptr %[[PRIV1]]
+// CHECK-NOT: store i32 %{{.+}}, ptr %[[ORIG0]]
+// CHECK-NOT: store i32 %{{.+}}, ptr %[[ORIG1]]
+
+// -----
+
+// Regression: a plain omp.task with no in_reduction must not emit any
+// __kmpc_task_reduction_get_th_data call.
+
+llvm.func @task_plain(%x : !llvm.ptr) {
+ omp.task {
+ %c1 = llvm.mlir.constant(1 : i32) : i32
+ llvm.store %c1, %x : i32, !llvm.ptr
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define void @task_plain(
+// CHECK-NOT: @__kmpc_task_reduction_get_th_data
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 5c22f7f081bb5..3d760f95c7ebc 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -262,10 +262,10 @@ atomic {
llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
omp.yield
}
-llvm.func @task_in_reduction(%x : !llvm.ptr) {
- // expected-error @below {{not yet implemented: Unhandled clause in_reduction in omp.task operation}}
+llvm.func @task_in_reduction_byref(%x : !llvm.ptr) {
+ // expected-error @below {{not yet implemented: Unhandled clause in_reduction with byref modifier in omp.task operation}}
// expected-error @below {{LLVM Translation failed for operation: omp.task}}
- omp.task in_reduction(@add_f32 %x -> %prv : !llvm.ptr) {
+ omp.task in_reduction(byref @add_f32 %x -> %prv : !llvm.ptr) {
omp.terminator
}
llvm.return
More information about the llvm-branch-commits
mailing list