[Mlir-commits] [llvm] [mlir] [mlir][OpenMP] Translation support for taskloop construct (PR #174386)
Tom Eccles
llvmlistbot at llvm.org
Mon Jan 5 03:33:51 PST 2026
https://github.com/tblah updated https://github.com/llvm/llvm-project/pull/174386
>From 000d6da0c7a7fbe739e476efd516315d1109e446 Mon Sep 17 00:00:00 2001
From: Kaviya Rajendiran <kaviyara2000 at gmail.com>
Date: Fri, 7 Nov 2025 12:49:53 +0530
Subject: [PATCH 01/14] [Flang][OpenMP] Translation support for taskloop
construct
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 16 ++
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 1 +
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 199 ++++++++++++++
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 257 +++++++++++++++++-
mlir/test/Target/LLVMIR/openmp-taskloop.mlir | 151 ++++++++++
mlir/test/Target/LLVMIR/openmp-todo.mlir | 15 +-
6 files changed, 622 insertions(+), 17 deletions(-)
create mode 100644 mlir/test/Target/LLVMIR/openmp-taskloop.mlir
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 05d8a7dd168a3..72c23cf263c9c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1402,6 +1402,22 @@ class OpenMPIRBuilder {
: DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {}
};
+ /// Generator for `#omp taskloop`
+ ///
+ /// \param Loc The location where the taskloop construct was encountered.
+ /// \param AllocaIP The insertion point to be used for alloca instructions.
+ /// \param BodyGenCB Callback that will generate the region code.
+ /// \param LoopInfo Callback that returns the CanonicalLoopInfo (CLI).
+ /// \param LBVal Lowerbound value of loop
+ /// \param UBVal Upperbound value of loop
+ /// \param StepVal Step value of loop
+ /// \param Tied True if the task is tied, false if the task is untied.
+ LLVM_ABI InsertPointOrErrorTy createTaskloop(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ BodyGenCallbackTy BodyGenCB,
+ llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
+ Value *LBVal, Value *UBVal, Value *StepVal, bool Tied = true);
+
/// Generator for `#omp task`
///
/// \param Loc The location where the task construct was encountered.
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 152a8f727310a..bb12c1558766b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -95,6 +95,7 @@ __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidP
__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr)
__OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8)
__OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr)
+__OMP_STRUCT_TYPE(Taskloop, kmp_task_info, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr, Int64, Int64, Int64)
__OMP_STRUCT_TYPE(ConfigurationEnvironment, ConfigurationEnvironmentTy, false,
Int8, Int8, Int8, Int32, Int32, Int32, Int32, Int32, Int32)
__OMP_STRUCT_TYPE(DynamicEnvironment, DynamicEnvironmentTy, false, Int16)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 5e4d4c7e49776..b42d846e3a460 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2025,6 +2025,205 @@ static Value *emitTaskDependencies(
return DepArray;
}
+OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ BodyGenCallbackTy BodyGenCB,
+ llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> loopInfo,
+ Value *LBVal, Value *UBVal, Value *StepVal, bool Tied) {
+
+ if (!updateToLocation(Loc))
+ return InsertPointTy();
+
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+
+ BasicBlock *TaskloopExitBB =
+ splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
+ BasicBlock *TaskloopBodyBB =
+ splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
+ BasicBlock *TaskloopAllocaBB =
+ splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
+
+ InsertPointTy TaskloopAllocaIP =
+ InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
+ InsertPointTy TaskloopBodyIP =
+ InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
+
+ if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
+ return Err;
+
+ llvm::Expected<llvm::CanonicalLoopInfo *> result = loopInfo();
+ if (!result) {
+ return result.takeError();
+ }
+
+ llvm::CanonicalLoopInfo *CLI = result.get();
+ OutlineInfo OI;
+ OI.EntryBB = TaskloopAllocaBB;
+ OI.OuterAllocaBB = AllocaIP.getBlock();
+ OI.ExitBB = TaskloopExitBB;
+
+ // Add the thread ID argument.
+ SmallVector<Instruction *, 4> ToBeDeleted;
+ // dummy instruction to be used as a fake argument
+ OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
+ Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
+
+ OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Tied,
+ TaskloopAllocaBB, CLI, Loc,
+ ToBeDeleted](Function &OutlinedFn) mutable {
+ // Replace the stale call instruction (CI) with the appropriate runtime calls.
+ assert(OutlinedFn.hasOneUse() &&
+ "there must be a single user for the outlined function");
+ CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+
+ // HasShareds is true if any variables are captured in the outlined region,
+ // false otherwise.
+ bool HasShareds = StaleCI->arg_size() > 1;
+ Builder.SetInsertPoint(StaleCI);
+
+ // Gather the arguments for emitting the runtime call for
+ // @__kmpc_omp_task_alloc
+ Function *TaskAllocFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
+
+ Value *ThreadID = getOrCreateThreadID(Ident);
+
+ // Emit runtime call for @__kmpc_taskgroup
+ Function *TaskgroupFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
+ Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
+
+ // The flags are set to 1 if the task is tied, 0 otherwise.
+ Value *Flags = Builder.getInt32(Tied);
+
+ Value *TaskSize = Builder.getInt64(
+ divideCeil(M.getDataLayout().getTypeSizeInBits(Taskloop), 8));
+
+ Value *SharedsSize = Builder.getInt64(0);
+ if (HasShareds) {
+ AllocaInst *ArgStructAlloca =
+ dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+ assert(ArgStructAlloca &&
+ "Unable to find the alloca instruction corresponding to arguments "
+ "for extracted function");
+ StructType *ArgStructType =
+ dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
+ assert(ArgStructType && "Unable to find struct type corresponding to "
+ "arguments for extracted function");
+ SharedsSize =
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
+ }
+
+ // Emit the @__kmpc_omp_task_alloc runtime call
+ // The runtime call returns a pointer to an area where the task captured
+ // variables must be copied before the task is run (TaskData)
+ CallInst *TaskData = Builder.CreateCall(
+ TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
+ /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
+ /*task_func=*/&OutlinedFn});
+
+ // Get the pointer to loop lb, ub, step from task ptr
+ // and set up the lowerbound,upperbound and step values
+ llvm::Value *lb =
+ Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 5);
+ // Value *LbVal_ext = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
+ Builder.CreateStore(LBVal, lb);
+
+ llvm::Value *ub =
+ Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 6);
+ Builder.CreateStore(UBVal, ub);
+
+ llvm::Value *step =
+ Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 7);
+ Value *Step_ext = Builder.CreateSExt(StepVal, Builder.getInt64Ty());
+ Builder.CreateStore(Step_ext, step);
+ llvm::Value *loadstep = Builder.CreateLoad(Builder.getInt64Ty(), step);
+
+ if (HasShareds) {
+ Value *Shareds = StaleCI->getArgOperand(1);
+ Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
+ Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
+ Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
+ SharedsSize);
+ }
+
+ // Set up the arguments for the __kmpc_taskloop runtime call, using default
+ // values for ifval, nogroup, sched, grainsize and task_dup.
+ Value *IfVal = Builder.getInt32(1);
+ Value *NoGroup = Builder.getInt32(1);
+ Value *Sched = Builder.getInt32(0);
+ Value *GrainSize = Builder.getInt64(0);
+ Value *TaskDup = Constant::getNullValue(Builder.getPtrTy());
+
+ Value *Args[] = {Ident, ThreadID, TaskData, IfVal, lb, ub,
+ loadstep, NoGroup, Sched, GrainSize, TaskDup};
+
+ // taskloop runtime call
+ Function *TaskloopFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
+ Builder.CreateCall(TaskloopFn, Args);
+
+ // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
+ Function *EndTaskgroupFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
+ Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
+
+ StaleCI->eraseFromParent();
+
+ Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
+
+ if (HasShareds) {
+ LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
+ OutlinedFn.getArg(1)->replaceUsesWithIf(
+ Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
+ }
+
+ Value *IV = CLI->getIndVar();
+ Type *IVTy = IV->getType();
+ Constant *One = ConstantInt::get(IVTy, 1);
+
+ Value *task_lb = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
+ OutlinedFn.getArg(1), 5, "gep_lb");
+ Value *LowerBound = Builder.CreateLoad(IVTy, task_lb, "lb");
+
+ Value *task_ub = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
+ OutlinedFn.getArg(1), 6, "gep_ub");
+ Value *UpperBound = Builder.CreateLoad(IVTy, task_ub, "ub");
+
+ Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
+
+ Value *TripCountMinusOne = Builder.CreateSub(UpperBound, LowerBound);
+ Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
+ // set the trip count in the CLI
+ CLI->setTripCount(TripCount);
+
+ Builder.SetInsertPoint(CLI->getBody(),
+ CLI->getBody()->getFirstInsertionPt());
+
+ llvm::BasicBlock *Body = CLI->getBody();
+ for (llvm::Instruction &I : *Body) {
+ if (auto *Add = llvm::dyn_cast<llvm::BinaryOperator>(&I)) {
+ if (Add->getOpcode() == llvm::Instruction::Add) {
+ if (llvm::isa<llvm::BinaryOperator>(Add->getOperand(0))) {
+ // update the starting index of the loop
+ Add->setOperand(1, LowerBound);
+ }
+ }
+ }
+ }
+
+ for (Instruction *I : llvm::reverse(ToBeDeleted)) {
+ I->eraseFromParent();
+ }
+ };
+
+ addOutlineInfo(std::move(OI));
+ Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
+ return Builder.saveIP();
+}
+
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
const LocationDescription &Loc, InsertPointTy AllocaIP,
BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index c37af8d7b1673..065eece0455e2 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -353,6 +353,26 @@ static LogicalResult checkImplementationStatus(Operation &op) {
if (op.getDevice())
result = todo("device");
};
+ auto checkFinal = [&todo](auto op, LogicalResult &result) {
+ if (op.getFinal())
+ result = todo("final");
+ };
+ auto checkGrainsize = [&todo](auto op, LogicalResult &result) {
+ if (op.getGrainsize())
+ result = todo("grainsize");
+ };
+ auto checkIf = [](auto op, LogicalResult &) {
+ if (op.getIfExpr())
+ op.emitWarning("if");
+ };
+ auto checkMergeable = [&todo](auto op, LogicalResult &result) {
+ if (op.getMergeable())
+ result = todo("mergeable");
+ };
+ auto checkNogroup = [&todo](auto op, LogicalResult &result) {
+ if (op.getNogroup())
+ result = todo("nogroup");
+ };
auto checkHint = [](auto op, LogicalResult &) {
if (op.getHint())
op.emitWarning("hint clause discarded");
@@ -366,6 +386,10 @@ static LogicalResult checkImplementationStatus(Operation &op) {
if (op.getNowait())
result = todo("nowait");
};
+ auto checkNumTasks = [&todo](auto op, LogicalResult &result) {
+ if (op.getNumTasks())
+ result = todo("num_tasks");
+ };
auto checkOrder = [&todo](auto op, LogicalResult &result) {
if (op.getOrder() || op.getOrderMod())
result = todo("order");
@@ -438,7 +462,15 @@ static LogicalResult checkImplementationStatus(Operation &op) {
checkNowait(op, result);
})
.Case([&](omp::TaskloopOp op) {
- // TODO: Add other clauses check
+ checkAllocate(op, result);
+ checkFinal(op, result);
+ checkGrainsize(op, result);
+ checkIf(op, result);
+ checkInReduction(op, result);
+ checkMergeable(op, result);
+ checkNogroup(op, result);
+ checkNumTasks(op, result);
+ checkReduction(op, result);
checkUntied(op, result);
checkPriority(op, result);
})
@@ -2177,6 +2209,8 @@ class TaskContextStructManager {
/// private decls.
void createGEPsToPrivateVars();
+ llvm::Value *isAllocated();
+
/// De-allocate the task context structure.
void freeStructPtr();
@@ -2257,13 +2291,26 @@ void TaskContextStructManager::createGEPsToPrivateVars() {
}
}
+llvm::Value *TaskContextStructManager::isAllocated() {
+ if (!structPtr)
+ return nullptr;
+
+ return builder.CreateIsNotNull(structPtr);
+}
+
void TaskContextStructManager::freeStructPtr() {
if (!structPtr)
return;
llvm::IRBuilderBase::InsertPointGuard guard{builder};
- // Ensure we don't put the call to free() after the terminator
- builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
+ llvm::BasicBlock *currentBlock = builder.GetInsertBlock();
+ if (currentBlock->getTerminator()) {
+ // Ensure we don't put the call to free() after the terminator
+ builder.SetInsertPoint(currentBlock->getTerminator());
+ } else {
+ // Insert the call to free() at the end of the current block
+ builder.SetInsertPoint(currentBlock);
+ }
builder.CreateFree(structPtr);
}
@@ -2499,6 +2546,207 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
return success();
}
+// Converts an OpenMP taskloop construct into LLVM IR using OpenMPIRBuilder.
+static LogicalResult
+convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) {
+ using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+ auto taskloopOp = cast<omp::TaskloopOp>(opInst);
+ if (failed(checkImplementationStatus(opInst)))
+ return failure();
+
+ // Stores the pointers to the allocated firstprivate copies so that the
+ // allocated space can be freed later.
+ SmallVector<llvm::Value *> llvmFirstPrivateVars;
+ PrivateVarsInfo privateVarsInfo(taskloopOp);
+ TaskContextStructManager taskStructMgr{builder, moduleTranslation,
+ privateVarsInfo.privatizers};
+
+ llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
+ findAllocaInsertPoint(builder, moduleTranslation);
+
+ assert(builder.GetInsertPoint() == builder.GetInsertBlock()->end());
+ llvm::BasicBlock *taskloopStartBlock = llvm::BasicBlock::Create(
+ builder.getContext(), "omp.taskloop.start",
+ /*Parent=*/builder.GetInsertBlock()->getParent());
+ llvm::Instruction *branchToTaskloopStartBlock =
+ builder.CreateBr(taskloopStartBlock);
+ builder.SetInsertPoint(branchToTaskloopStartBlock);
+
+ llvm::BasicBlock *copyBlock =
+ splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
+ llvm::BasicBlock *initBlock =
+ splitBB(builder, /*CreateBranch=*/true, "omp.private.init");
+
+ LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
+ moduleTranslation, allocaIP);
+
+ // Allocate and initialize private variables
+ builder.SetInsertPoint(initBlock->getTerminator());
+
+ taskStructMgr.generateTaskContextStruct();
+ taskStructMgr.createGEPsToPrivateVars();
+
+ llvmFirstPrivateVars.resize(privateVarsInfo.blockArgs.size());
+ int index = 0;
+
+ for (auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVarAlloc] :
+ llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.mlirVars,
+ privateVarsInfo.blockArgs,
+ taskStructMgr.getLLVMPrivateVarGEPs())) {
+ // To be handled inside the taskloop.
+ if (!privDecl.readsFromMold())
+ continue;
+ assert(llvmPrivateVarAlloc &&
+ "reads from mold so shouldn't have been skipped");
+
+ llvm::Expected<llvm::Value *> privateVarOrErr =
+ initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar,
+ blockArg, llvmPrivateVarAlloc, initBlock);
+ if (!privateVarOrErr)
+ return handleError(privateVarOrErr, *taskloopOp.getOperation());
+
+ llvmFirstPrivateVars[index++] = privateVarOrErr.get();
+
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
+
+ if ((privateVarOrErr.get() != llvmPrivateVarAlloc) &&
+ !mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
+ builder.CreateStore(privateVarOrErr.get(), llvmPrivateVarAlloc);
+ // Load it so we have the value pointed to by the GEP
+ llvmPrivateVarAlloc = builder.CreateLoad(privateVarOrErr.get()->getType(),
+ llvmPrivateVarAlloc);
+ }
+ assert(llvmPrivateVarAlloc->getType() ==
+ moduleTranslation.convertType(blockArg.getType()));
+ }
+
+ // firstprivate copy region
+ setInsertPointForPossiblyEmptyBlock(builder, copyBlock);
+ if (failed(copyFirstPrivateVars(
+ taskloopOp, builder, moduleTranslation, privateVarsInfo.mlirVars,
+ taskStructMgr.getLLVMPrivateVarGEPs(), privateVarsInfo.privatizers,
+ taskloopOp.getPrivateNeedsBarrier())))
+ return llvm::failure();
+
+ // Set up insertion point for call to createTaskloop()
+ builder.SetInsertPoint(taskloopStartBlock);
+
+ auto bodyCB = [&](InsertPointTy allocaIP,
+ InsertPointTy codegenIP) -> llvm::Error {
+ // Save the alloca insertion point on ModuleTranslation stack for use in
+ // nested regions.
+ LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
+ moduleTranslation, allocaIP);
+
+ // translate the body of the taskloop:
+ builder.restoreIP(codegenIP);
+
+ llvm::BasicBlock *privInitBlock = nullptr;
+ privateVarsInfo.llvmVars.resize(privateVarsInfo.blockArgs.size());
+ for (auto [i, zip] : llvm::enumerate(llvm::zip_equal(
+ privateVarsInfo.blockArgs, privateVarsInfo.privatizers,
+ privateVarsInfo.mlirVars))) {
+ auto [blockArg, privDecl, mlirPrivVar] = zip;
+ // This is handled before the task executes
+ if (privDecl.readsFromMold())
+ continue;
+
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ llvm::Type *llvmAllocType =
+ moduleTranslation.convertType(privDecl.getType());
+ builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
+ llvm::Value *llvmPrivateVar = builder.CreateAlloca(
+ llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
+
+ llvm::Expected<llvm::Value *> privateVarOrError =
+ initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar,
+ blockArg, llvmPrivateVar, privInitBlock);
+ if (!privateVarOrError)
+ return privateVarOrError.takeError();
+ moduleTranslation.mapValue(blockArg, privateVarOrError.get());
+ privateVarsInfo.llvmVars[i] = privateVarOrError.get();
+ // Add private var to llvmFirstPrivateVars
+ llvmFirstPrivateVars[index++] = privateVarOrError.get();
+ }
+
+ taskStructMgr.createGEPsToPrivateVars();
+ for (auto [i, llvmPrivVar] :
+ llvm::enumerate(taskStructMgr.getLLVMPrivateVarGEPs())) {
+ if (!llvmPrivVar) {
+ assert(privateVarsInfo.llvmVars[i] &&
+ "This is added in the loop above");
+ continue;
+ }
+ privateVarsInfo.llvmVars[i] = llvmPrivVar;
+ }
+
+ // Find and map the addresses of each variable within the taskloop context
+ // structure
+ for (auto [blockArg, llvmPrivateVar, privateDecl] :
+ llvm::zip_equal(privateVarsInfo.blockArgs, privateVarsInfo.llvmVars,
+ privateVarsInfo.privatizers)) {
+ // This was handled above.
+ if (!privateDecl.readsFromMold())
+ continue;
+ // Fix broken pass-by-value case for Fortran character boxes
+ if (!mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
+ llvmPrivateVar = builder.CreateLoad(
+ moduleTranslation.convertType(blockArg.getType()), llvmPrivateVar);
+ }
+ assert(llvmPrivateVar->getType() ==
+ moduleTranslation.convertType(blockArg.getType()));
+ moduleTranslation.mapValue(blockArg, llvmPrivateVar);
+ }
+
+ auto continuationBlockOrError =
+ convertOmpOpRegions(taskloopOp.getRegion(), "omp.taskloop.region",
+ builder, moduleTranslation);
+ ;
+ if (failed(handleError(continuationBlockOrError, opInst)))
+ return llvm::make_error<PreviouslyReportedError>();
+
+ builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
+
+ // dummy check to ensure that the task context structure is accessed inside
+ // the outlined fn.
+ llvm::Value *cond = taskStructMgr.isAllocated();
+ return llvm::Error::success();
+ };
+
+ auto loopOp = cast<omp::LoopNestOp>(taskloopOp.getWrappedLoop());
+
+ auto loopInfo = [&]() -> llvm::Expected<llvm::CanonicalLoopInfo *> {
+ llvm::CanonicalLoopInfo *loopInfo = findCurrentLoopInfo(moduleTranslation);
+ return loopInfo;
+ };
+
+ llvm::OpenMPIRBuilder &ompBuilder = *moduleTranslation.getOpenMPBuilder();
+ llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
+ moduleTranslation.getOpenMPBuilder()->createTaskloop(
+ ompLoc, allocaIP, bodyCB, loopInfo,
+ moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[0]),
+ moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[0]),
+ moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]));
+
+ if (failed(handleError(afterIP, opInst)))
+ return failure();
+
+ builder.restoreIP(*afterIP);
+
+ // Free the task context structure in the exit block of the taskloop.
+ if (failed(cleanupPrivateVars(builder, moduleTranslation, taskloopOp.getLoc(),
+ llvmFirstPrivateVars,
+ privateVarsInfo.privatizers)))
+ return failure();
+
+ taskStructMgr.freeStructPtr();
+
+ return success();
+}
+
/// Converts an OpenMP taskgroup construct into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
@@ -6616,6 +6864,9 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
.Case([&](omp::TaskOp op) {
return convertOmpTaskOp(op, builder, moduleTranslation);
})
+ .Case([&](omp::TaskloopOp op) {
+ return convertOmpTaskloopOp(*op, builder, moduleTranslation);
+ })
.Case([&](omp::TaskgroupOp op) {
return convertOmpTaskgroupOp(op, builder, moduleTranslation);
})
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
new file mode 100644
index 0000000000000..536a1fe9d9157
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
@@ -0,0 +1,151 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+omp.private {type = private} @_QFtestEi_private_i32 : i32
+
+omp.private {type = firstprivate} @_QFtestEa_firstprivate_i32 : i32 copy {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ %0 = llvm.load %arg0 : !llvm.ptr -> i32
+ llvm.store %0, %arg1 : i32, !llvm.ptr
+ omp.yield(%arg1 : !llvm.ptr)
+}
+
+
+llvm.func @_QPtest() {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+ %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+ %6 = llvm.mlir.constant(20 : i32) : i32
+ llvm.store %6, %3 : i32, !llvm.ptr
+ %7 = llvm.mlir.constant(1 : i32) : i32
+ %8 = llvm.mlir.constant(5 : i32) : i32
+ %9 = llvm.mlir.constant(1 : i32) : i32
+ omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+ omp.loop_nest (%arg2) : i32 = (%7) to (%8) inclusive step (%9) {
+ llvm.store %arg2, %arg1 : i32, !llvm.ptr
+ %10 = llvm.load %arg0 : !llvm.ptr -> i32
+ %11 = llvm.mlir.constant(1 : i32) : i32
+ %12 = llvm.add %10, %11 : i32
+ llvm.store %12, %arg0 : i32, !llvm.ptr
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// CHECK: %struct.kmp_task_info = type { ptr, ptr, i32, ptr, ptr, i64, i64, i64 }
+
+// CHECK-LABEL: define void @_QPtest() {
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8
+// CHECK: %[[VAL1:.*]] = alloca i32, i64 1, align 4
+// CHECK: %[[VAL_X:.*]] = alloca i32, i64 1, align 4
+// CHECK: store i32 20, ptr %[[VAL_X]], align 4
+// CHECK: br label %entry
+
+// CHECK: entry:
+// CHECK: br label %omp.private.init
+
+// CHECK: omp.private.init: ; preds = %entry
+// CHECK: %[[OMP_TASK_CONTEXT_PTR:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ i32 }, ptr null, i32 1) to i64))
+// CHECK: %[[PRIV_GEP:.*]] = getelementptr { i32 }, ptr %[[OMP_TASK_CONTEXT_PTR]], i32 0, i32 0
+// CHECK: br label %omp.private.copy
+
+// CHECK: omp.private.copy:
+// CHECK: br label %omp.private.copy1
+
+// CHECK: omp.private.copy1:
+// CHECK: %[[LOAD_X:.*]] = load i32, ptr %[[VAL_X]], align 4
+// CHECK: store i32 %[[LOAD_X]], ptr %[[PRIV_GEP]], align 4
+// CHECK: br label %omp.taskloop.start
+
+// CHECK: omp.taskloop.start:
+// CHECK: br label %codeRepl
+
+// CHECK: codeRepl:
+// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0
+// CHECK: store ptr %[[OMP_TASK_CONTEXT_PTR]], ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8
+// CHECK: %[[GTID:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK: call void @__kmpc_taskgroup(ptr @1, i32 %[[GTID]])
+// CHECK: %[[TASK_PTR:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[GTID]], i32 1, i64 64, i64 8, ptr @_QPtest..omp_par)
+// CHECK: %[[LB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 5
+// CHECK: store i32 1, ptr %[[LB_GEP]], align 4
+// CHECK: %[[UB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 6
+// CHECK: store i32 5, ptr %[[UB_GEP]], align 4
+// CHECK: %[[STEP_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 7
+// CHECK: store i64 1, ptr %[[STEP_GEP]], align 4
+// CHECK: %[[LOAD_STEP:.*]] = load i64, ptr %[[STEP_GEP]], align 4
+// CHECK: %10 = load ptr, ptr %[[TASK_PTR]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %10, ptr align 1 %[[STRUCTARG]], i64 8, i1 false)
+// CHECK: call void @__kmpc_taskloop(ptr @1, i32 %[[GTID]], ptr %[[TASK_PTR]], i32 1, ptr %[[LB_GEP]], ptr %[[UB_GEP]], i64 %[[LOAD_STEP]], i32 1, i32 0, i64 0, ptr null)
+// CHECK: call void @__kmpc_end_taskgroup(ptr @1, i32 %[[GTID]])
+// CHECK: br label %taskloop.exit
+
+// CHECK: taskloop.exit:
+// CHECK: tail call void @free(ptr %[[OMP_TASK_CONTEXT_PTR]])
+// CHECK: ret void
+// CHECK: }
+
+// CHECK-LABEL: define internal void @_QPtest..omp_par
+// CHECK-SAME: i32 %[[GLOBAL_TID:.*]], ptr %[[TASK_PTR1:.*]]) {
+// CHECK: taskloop.alloca:
+// CHECK: %[[LOAD_TASK_PTR:.*]] = load ptr, ptr %[[TASK_PTR1]], align 8
+// CHECK: %[[GEP_LB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 5
+// CHECK: %[[LB:.*]] = load i32, ptr %[[GEP_LB]], align 4
+// CHECK: %[[GEP_UB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 6
+// CHECK: %[[UB:.*]] = load i32, ptr %[[GEP_UB]], align 4
+// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[LOAD_TASK_PTR]], i32 0, i32 0
+// CHECK: %[[LOADGEP_OMP_TASK_CONTEXT_PTR:.*]] = load ptr, ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8, !align !1
+// CHECK: %[[OMP_PRIVATE_ALLOC:.*]] = alloca i32, align 4
+// CHECK: br label %taskloop.body
+
+// CHECK: taskloop.body:
+// CHECK: %[[LOAD_X:.*]] = getelementptr { i32 }, ptr %[[LOADGEP_OMP_TASK_CONTEXT_PTR]], i32 0, i32 0
+// CHECK: br label %omp.taskloop.region
+
+// CHECK: omp.taskloop.region:
+// CHECK: br label %omp_loop.preheader
+
+// CHECK: omp_loop.preheader:
+// CHECK: %[[VAL2:.*]] = sub i32 %[[UB]], %[[LB]]
+// CHECK: %[[TRIP_CNT:.*]] = add i32 %[[VAL2]], 1
+// CHECK: br label %omp_loop.header
+
+// CHECK: omp_loop.header:
+// CHECK: %[[OMP_LOOP_IV:.*]] = phi i32 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
+// CHECK: br label %omp_loop.cond
+
+// CHECK: omp_loop.cond:
+// CHECK: %[[OMP_LOOP_CMP:.*]] = icmp ult i32 %[[OMP_LOOP_IV]], %[[TRIP_CNT]]
+// CHECK: br i1 %[[OMP_LOOP_CMP]], label %omp_loop.body, label %omp_loop.exit
+
+// CHECK: omp_loop.exit:
+// CHECK: br label %omp_loop.after
+
+// CHECK: omp_loop.after:
+// CHECK: br label %omp.region.cont
+
+// CHECK: omp.region.cont:
+// CHECK: %[[IS_ALLOCATED:.*]] = icmp ne ptr %[[LOADGEP_OMP_TASK_CONTEXT_PTR]], null
+// CHECK: br label %taskloop.exit.exitStub
+
+// CHECK: omp_loop.body:
+// CHECK: %[[VAL3:.*]] = mul i32 %[[OMP_LOOP_IV]], 1
+// CHECK: %[[VAL5:.*]] = add i32 %[[VAL3]], %[[LB]]
+// CHECK: br label %omp.loop_nest.region
+
+// CHECK: omp.loop_nest.region:
+// CHECK: store i32 %[[VAL5]], ptr %[[OMP_PRIVATE_ALLOC]], align 4
+// CHECK: %[[VAL6:.*]] = load i32, ptr %[[LOAD_X]], align 4
+// CHECK: %[[RES:.*]] = add i32 %[[VAL6]], 1
+// CHECK: store i32 %[[RES]], ptr %[[LOAD_X]], align 4
+// CHECK: br label %omp.region.cont2
+
+// CHECK: omp.region.cont2:
+// CHECK: br label %omp_loop.inc
+
+// CHECK: omp_loop.inc:
+// CHECK: %omp_loop.next = add nuw i32 %[[OMP_LOOP_IV]], 1
+// CHECK: br label %omp_loop.header
+
+// CHECK: taskloop.exit.exitStub:
+// CHECK: ret void
+// CHECK: }
\ No newline at end of file
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 396c57af81c44..804398fc75a76 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -323,21 +323,8 @@ llvm.func @taskgroup_task_reduction(%x : !llvm.ptr) {
// -----
-llvm.func @taskloop(%lb : i32, %ub : i32, %step : i32) {
- // expected-error at below {{not yet implemented: omp.taskloop}}
- // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
- omp.taskloop {
- omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
- omp.yield
- }
- }
- llvm.return
-}
-
-// -----
-
llvm.func @taskloop_untied(%lb : i32, %ub : i32, %step : i32) {
- // expected-error at below {{not yet implemented: omp.taskloop}}
+ // expected-error at below {{not yet implemented: Unhandled clause untied in omp.taskloop operation}}
// expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
omp.taskloop untied {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
>From 28d2db3705980bd908d7a3a12bd6c0e0ba7e4f2f Mon Sep 17 00:00:00 2001
From: Kaviya Rajendiran <kaviyara2000 at gmail.com>
Date: Mon, 17 Nov 2025 00:27:35 +0530
Subject: [PATCH 02/14] [Flang][OpenMP] Addressed review comments
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 27 +++++++++++--------
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 23 +++++++++++++---
mlir/test/Target/LLVMIR/openmp-taskloop.mlir | 10 ++++---
3 files changed, 41 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index b42d846e3a460..d7769c0f85d31 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2028,7 +2028,7 @@ static Value *emitTaskDependencies(
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
const LocationDescription &Loc, InsertPointTy AllocaIP,
BodyGenCallbackTy BodyGenCB,
- llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> loopInfo,
+ llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
Value *LBVal, Value *UBVal, Value *StepVal, bool Tied) {
if (!updateToLocation(Loc))
@@ -2053,7 +2053,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
return Err;
- llvm::Expected<llvm::CanonicalLoopInfo *> result = loopInfo();
+ llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
if (!result) {
return result.takeError();
}
@@ -2128,12 +2128,13 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
// and set up the lowerbound,upperbound and step values
llvm::Value *lb =
Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 5);
- // Value *LbVal_ext = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
- Builder.CreateStore(LBVal, lb);
+ Value *LbVal_ext = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
+ Builder.CreateStore(LbVal_ext, lb);
llvm::Value *ub =
Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 6);
- Builder.CreateStore(UBVal, ub);
+ Value *UbVal_ext = Builder.CreateSExt(UBVal, Builder.getInt64Ty());
+ Builder.CreateStore(UbVal_ext, ub);
llvm::Value *step =
Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 7);
@@ -2155,6 +2156,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Value *NoGroup = Builder.getInt32(1);
Value *Sched = Builder.getInt32(0);
Value *GrainSize = Builder.getInt64(0);
+
+ // TODO: Handle the case when TaskDup pointer isn't empty
Value *TaskDup = Constant::getNullValue(Builder.getPtrTy());
Value *Args[] = {Ident, ThreadID, TaskData, IfVal, lb, ub,
@@ -2184,13 +2187,15 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Type *IVTy = IV->getType();
Constant *One = ConstantInt::get(IVTy, 1);
- Value *task_lb = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
- OutlinedFn.getArg(1), 5, "gep_lb");
- Value *LowerBound = Builder.CreateLoad(IVTy, task_lb, "lb");
+ Value *TaskLB = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
+ OutlinedFn.getArg(1), 5, "gep_lb");
+ Value *LoadTaskLB = Builder.CreateLoad(Builder.getInt64Ty(), TaskLB);
+ Value *LowerBound = Builder.CreateTrunc(LoadTaskLB, IVTy, "lb");
- Value *task_ub = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
- OutlinedFn.getArg(1), 6, "gep_ub");
- Value *UpperBound = Builder.CreateLoad(IVTy, task_ub, "ub");
+ Value *TaskUB = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
+ OutlinedFn.getArg(1), 6, "gep_ub");
+ Value *LoadTaskUB = Builder.CreateLoad(Builder.getInt64Ty(), TaskUB);
+ Value *UpperBound = Builder.CreateTrunc(LoadTaskUB, IVTy, "ub");
Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 065eece0455e2..5869b956ca24e 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -345,6 +345,10 @@ static LogicalResult checkImplementationStatus(Operation &op) {
result = todo("cancel directive inside of taskloop");
}
};
+ auto checkCollapse = [&todo](auto op, LogicalResult &result) {
+ if (op.getCollapseNumLoops() > 1)
+ result = todo("collapse");
+ };
auto checkDepend = [&todo](auto op, LogicalResult &result) {
if (!op.getDependVars().empty() || op.getDependKinds())
result = todo("depend");
@@ -361,9 +365,9 @@ static LogicalResult checkImplementationStatus(Operation &op) {
if (op.getGrainsize())
result = todo("grainsize");
};
- auto checkIf = [](auto op, LogicalResult &) {
+ auto checkIf = [&todo](auto op, LogicalResult &result) {
if (op.getIfExpr())
- op.emitWarning("if");
+ result = todo("if");
};
auto checkMergeable = [&todo](auto op, LogicalResult &result) {
if (op.getMergeable())
@@ -435,6 +439,10 @@ static LogicalResult checkImplementationStatus(Operation &op) {
checkAllocate(op, result);
checkOrder(op, result);
})
+ .Case([&](omp::LoopNestOp op) {
+ if (mlir::isa<omp::TaskloopOp>(op.getOperation()->getParentOp()))
+ checkCollapse(op, result);
+ })
.Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); })
.Case([&](omp::SectionsOp op) {
checkAllocate(op, result);
@@ -2711,7 +2719,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
// dummy check to ensure that the task context structure is accessed inside
// the outlined fn.
- llvm::Value *cond = taskStructMgr.isAllocated();
+ [[maybe_unused]] llvm::Value *cond = taskStructMgr.isAllocated();
return llvm::Error::success();
};
@@ -2722,7 +2730,6 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
return loopInfo;
};
- llvm::OpenMPIRBuilder &ompBuilder = *moduleTranslation.getOpenMPBuilder();
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
moduleTranslation.getOpenMPBuilder()->createTaskloop(
@@ -2742,6 +2749,11 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
privateVarsInfo.privatizers)))
return failure();
+ // Note: This free is valid because end_taskgroup waits until all generated
+ // tasks are complete before returning. In the presence of Nogroup clause,
+ // @__kmpc_taskgroup(..)/@__kmpc_end_taskgroup(..) is not called, have to
+ // ensure that this freeStructPtr() is not called until every thread has
+ // completed execution
taskStructMgr.freeStructPtr();
return success();
@@ -3398,6 +3410,9 @@ convertOmpLoopNest(Operation &opInst, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
auto loopOp = cast<omp::LoopNestOp>(opInst);
+ if (failed(checkImplementationStatus(opInst)))
+ return failure();
+
// Set up the source location value for OpenMP runtime.
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
index 536a1fe9d9157..8179784a47d90 100644
--- a/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
@@ -67,9 +67,9 @@ llvm.func @_QPtest() {
// CHECK: call void @__kmpc_taskgroup(ptr @1, i32 %[[GTID]])
// CHECK: %[[TASK_PTR:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[GTID]], i32 1, i64 64, i64 8, ptr @_QPtest..omp_par)
// CHECK: %[[LB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 5
-// CHECK: store i32 1, ptr %[[LB_GEP]], align 4
+// CHECK: store i64 1, ptr %[[LB_GEP]], align 4
// CHECK: %[[UB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 6
-// CHECK: store i32 5, ptr %[[UB_GEP]], align 4
+// CHECK: store i64 5, ptr %[[UB_GEP]], align 4
// CHECK: %[[STEP_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 7
// CHECK: store i64 1, ptr %[[STEP_GEP]], align 4
// CHECK: %[[LOAD_STEP:.*]] = load i64, ptr %[[STEP_GEP]], align 4
@@ -89,9 +89,11 @@ llvm.func @_QPtest() {
// CHECK: taskloop.alloca:
// CHECK: %[[LOAD_TASK_PTR:.*]] = load ptr, ptr %[[TASK_PTR1]], align 8
// CHECK: %[[GEP_LB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 5
-// CHECK: %[[LB:.*]] = load i32, ptr %[[GEP_LB]], align 4
+// CHECK: %[[LOAD_LB64:.*]] = load i64, ptr %[[GEP_LB]], align 4
+// CHECK: %[[LB:.*]] = trunc i64 %[[LOAD_LB64]] to i32
// CHECK: %[[GEP_UB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 6
-// CHECK: %[[UB:.*]] = load i32, ptr %[[GEP_UB]], align 4
+// CHECK: %[[LOAD_UB64:.*]] = load i64, ptr %[[GEP_UB]], align 4
+// CHECK: %[[UB:.*]] = trunc i64 %[[LOAD_UB64]] to i32
// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[LOAD_TASK_PTR]], i32 0, i32 0
// CHECK: %[[LOADGEP_OMP_TASK_CONTEXT_PTR:.*]] = load ptr, ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8, !align !1
// CHECK: %[[OMP_PRIVATE_ALLOC:.*]] = alloca i32, align 4
>From 2ba219968a43a0898bbce1b5f258e4ed9ec3af6e Mon Sep 17 00:00:00 2001
From: Kaviya Rajendiran <kaviyara2000 at gmail.com>
Date: Mon, 17 Nov 2025 14:14:57 +0530
Subject: [PATCH 03/14] [Flang][OpenMP] Added TODO testcases for taskloop
clauses
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 2 +-
mlir/test/Target/LLVMIR/openmp-todo.mlir | 161 ++++++++++++++++++
2 files changed, 162 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 5869b956ca24e..71ace3ff45052 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -411,7 +411,7 @@ static LogicalResult checkImplementationStatus(Operation &op) {
result = todo("privatization");
};
auto checkReduction = [&todo](auto op, LogicalResult &result) {
- if (isa<omp::TeamsOp>(op))
+ if (isa<omp::TeamsOp>(op) || isa<omp::TaskloopOp>(op))
if (!op.getReductionVars().empty() || op.getReductionByref() ||
op.getReductionSyms())
result = todo("reduction");
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 804398fc75a76..04120166622ee 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -320,6 +320,167 @@ llvm.func @taskgroup_task_reduction(%x : !llvm.ptr) {
}
llvm.return
}
+// -----
+
+llvm.func @taskloop_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error at below {{not yet implemented: Unhandled clause allocate in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop allocate(%x : !llvm.ptr -> %x : !llvm.ptr) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_collapse(%lb : i32, %ub : i32, %step : i32, %lb1 : i32, %ub1 : i32, %step1 : i32) {
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop {
+ // expected-error at below {{not yet implemented: Unhandled clause collapse in omp.loop_nest operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.loop_nest}}
+ omp.loop_nest (%iv, %iv1) : i32 = (%lb, %lb1) to (%ub, %ub1) inclusive step (%step, %step1) collapse(2) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_final(%lb : i32, %ub : i32, %step : i32, %true : i1) {
+ // expected-error at below {{not yet implemented: Unhandled clause final in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop final(%true) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_grainsize(%lb : i32, %ub : i32, %step : i32, %grainsize : i32) {
+ // expected-error at below {{not yet implemented: Unhandled clause grainsize in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop grainsize(%grainsize: i32) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_if(%lb : i32, %ub : i32, %step : i32, %true : i1) {
+ // expected-error at below {{not yet implemented: Unhandled clause if in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop if(%true) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+ omp.declare_reduction @add_reduction_i32 : i32 init {
+ ^bb0(%arg0: i32):
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%0 : i32)
+ }combiner {
+ ^bb0(%arg0: i32, %arg1: i32):
+ %0 = llvm.add %arg0, %arg1 : i32
+ omp.yield(%0 : i32)
+ }
+
+llvm.func @taskloop_inreduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error at below {{not yet implemented: Unhandled clause in_reduction in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop in_reduction(@add_reduction_i32 %x -> %arg0 : !llvm.ptr) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_mergeable(%lb : i32, %ub : i32, %step : i32) {
+ // expected-error at below {{not yet implemented: Unhandled clause mergeable in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop mergeable {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_nogroup(%lb : i32, %ub : i32, %step : i32) {
+ // expected-error at below {{not yet implemented: Unhandled clause nogroup in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop nogroup {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_num_tasks(%lb : i32, %ub : i32, %step : i32, %numtasks : i32) {
+ // expected-error at below {{not yet implemented: Unhandled clause num_tasks in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop num_tasks(%numtasks: i32) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_priority(%lb : i32, %ub : i32, %step : i32, %priority : i32) {
+ // expected-error at below {{not yet implemented: Unhandled clause priority in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop priority(%priority: i32) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+ omp.declare_reduction @add_reduction_i32 : i32 init {
+ ^bb0(%arg0: i32):
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%0 : i32)
+ }combiner {
+ ^bb0(%arg0: i32, %arg1: i32):
+ %0 = llvm.add %arg0, %arg1 : i32
+ omp.yield(%0 : i32)
+ }
+
+llvm.func @taskloop_reduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error at below {{not yet implemented: Unhandled clause reduction in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop reduction(@add_reduction_i32 %x -> %arg0 : !llvm.ptr) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
// -----
>From baff552e6de35e5b84a4efe586e119b4cab37eea Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Tue, 2 Dec 2025 08:47:54 +0000
Subject: [PATCH 04/14] [Flang][OpenMP] Rework bounds in Taskloop lowering
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 137 +++++++++++++---------
1 file changed, 79 insertions(+), 58 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index d7769c0f85d31..39adde453b1eb 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2065,10 +2065,16 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
OI.ExitBB = TaskloopExitBB;
// Add the thread ID argument.
- SmallVector<Instruction *, 4> ToBeDeleted;
+ SmallVector<Instruction *> ToBeDeleted;
// dummy instruction to be used as a fake argument
OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
+ createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
+ "global.lb", false);
+ createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
+ "global.ub", false);
+ createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
+ "global.step", false);
OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Tied,
TaskloopAllocaBB, CLI, Loc,
@@ -2078,9 +2084,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
"there must be a single user for the outlined function");
CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
- // HasShareds is true if any variables are captured in the outlined region,
- // false otherwise.
- bool HasShareds = StaleCI->arg_size() > 1;
Builder.SetInsertPoint(StaleCI);
// Gather the arguments for emitting the runtime call for
@@ -2101,20 +2104,17 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Value *TaskSize = Builder.getInt64(
divideCeil(M.getDataLayout().getTypeSizeInBits(Taskloop), 8));
- Value *SharedsSize = Builder.getInt64(0);
- if (HasShareds) {
- AllocaInst *ArgStructAlloca =
- dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
- assert(ArgStructAlloca &&
- "Unable to find the alloca instruction corresponding to arguments "
- "for extracted function");
- StructType *ArgStructType =
- dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
- assert(ArgStructType && "Unable to find struct type corresponding to "
- "arguments for extracted function");
- SharedsSize =
- Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
- }
+ AllocaInst *ArgStructAlloca =
+ dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+ assert(ArgStructAlloca &&
+ "Unable to find the alloca instruction corresponding to arguments "
+ "for extracted function");
+ StructType *ArgStructType =
+ dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
+ assert(ArgStructType && "Unable to find struct type corresponding to "
+ "arguments for extracted function");
+ Value *SharedsSize =
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
// Emit the @__kmpc_omp_task_alloc runtime call
// The runtime call returns a pointer to an area where the task captured
@@ -2124,31 +2124,25 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
/*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
/*task_func=*/&OutlinedFn});
+ Value *Shareds = StaleCI->getArgOperand(1);
+ Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
+ Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
+ Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
+ SharedsSize);
// Get the pointer to loop lb, ub, step from task ptr
// and set up the lowerbound,upperbound and step values
- llvm::Value *lb =
- Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 5);
- Value *LbVal_ext = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
- Builder.CreateStore(LbVal_ext, lb);
-
- llvm::Value *ub =
- Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 6);
- Value *UbVal_ext = Builder.CreateSExt(UBVal, Builder.getInt64Ty());
- Builder.CreateStore(UbVal_ext, ub);
-
- llvm::Value *step =
- Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 7);
- Value *Step_ext = Builder.CreateSExt(StepVal, Builder.getInt64Ty());
- Builder.CreateStore(Step_ext, step);
- llvm::Value *loadstep = Builder.CreateLoad(Builder.getInt64Ty(), step);
+ llvm::Value *Lb = Builder.CreateStructGEP(ArgStructType, TaskShareds, 0);
+ Value *LbValExt = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
+ Builder.CreateStore(LbValExt, Lb);
- if (HasShareds) {
- Value *Shareds = StaleCI->getArgOperand(1);
- Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
- Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
- Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
- SharedsSize);
- }
+ llvm::Value *Ub = Builder.CreateStructGEP(ArgStructType, TaskShareds, 1);
+ Value *UbValExt = Builder.CreateSExt(UBVal, Builder.getInt64Ty());
+ Builder.CreateStore(UbValExt, Ub);
+
+ llvm::Value *Step = Builder.CreateStructGEP(ArgStructType, TaskShareds, 2);
+ Value *StepExt = Builder.CreateSExt(StepVal, Builder.getInt64Ty());
+ Builder.CreateStore(StepExt, Step);
+ llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
// set up the arguments for emitting kmpc_taskloop runtime call
// setting default values for ifval, nogroup, sched, grainsize, task_dup
@@ -2160,8 +2154,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
// TODO: Handle the case when TaskDup pointer isn't empty
Value *TaskDup = Constant::getNullValue(Builder.getPtrTy());
- Value *Args[] = {Ident, ThreadID, TaskData, IfVal, lb, ub,
- loadstep, NoGroup, Sched, GrainSize, TaskDup};
+ Value *Args[] = {Ident, ThreadID, TaskData, IfVal, Lb, Ub,
+ Loadstep, NoGroup, Sched, GrainSize, TaskDup};
// taskloop runtime call
Function *TaskloopFn =
@@ -2177,29 +2171,53 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
- if (HasShareds) {
- LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
- OutlinedFn.getArg(1)->replaceUsesWithIf(
- Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
- }
+ LoadInst *SharedsOutlined =
+ Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
+ OutlinedFn.getArg(1)->replaceUsesWithIf(
+ SharedsOutlined,
+ [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
Value *IV = CLI->getIndVar();
Type *IVTy = IV->getType();
Constant *One = ConstantInt::get(IVTy, 1);
- Value *TaskLB = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
- OutlinedFn.getArg(1), 5, "gep_lb");
- Value *LoadTaskLB = Builder.CreateLoad(Builder.getInt64Ty(), TaskLB);
- Value *LowerBound = Builder.CreateTrunc(LoadTaskLB, IVTy, "lb");
-
- Value *TaskUB = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
- OutlinedFn.getArg(1), 6, "gep_ub");
- Value *LoadTaskUB = Builder.CreateLoad(Builder.getInt64Ty(), TaskUB);
- Value *UpperBound = Builder.CreateTrunc(LoadTaskUB, IVTy, "ub");
+ // When outlining, CodeExtractor will create GEP's to the LowerBound and
+ // UpperBound. These GEP's can be reused for loading the tasks respective
+ // bounds.
+ Value *TaskLB = nullptr;
+ Value *TaskUB = nullptr;
+ Value *LoadTaskLB = nullptr;
+ Value *LoadTaskUB = nullptr;
+ for (Instruction &I : *TaskloopAllocaBB) {
+ if (I.getOpcode() == Instruction::GetElementPtr) {
+ GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
+ switch (CI->getZExtValue()) {
+ case 0:
+ TaskLB = &I;
+ break;
+ case 1:
+ TaskUB = &I;
+ break;
+ }
+ }
+ } else if (I.getOpcode() == Instruction::Load) {
+ LoadInst &Load = cast<LoadInst>(I);
+ if (Load.getPointerOperand() == TaskLB) {
+ assert(TaskLB != nullptr && "Expected value for TaskLB");
+ LoadTaskLB = &I;
+ } else if (Load.getPointerOperand() == TaskUB) {
+ assert(TaskUB != nullptr && "Expected value for TaskUB");
+ LoadTaskUB = &I;
+ }
+ }
+ }
Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
- Value *TripCountMinusOne = Builder.CreateSub(UpperBound, LowerBound);
+ assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
+ assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
+ Value *TripCountMinusOne = Builder.CreateSub(LoadTaskUB, LoadTaskLB);
Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
// set the trip count in the CLI
CLI->setTripCount(TripCount);
@@ -2213,13 +2231,16 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
if (Add->getOpcode() == llvm::Instruction::Add) {
if (llvm::isa<llvm::BinaryOperator>(Add->getOperand(0))) {
// update the starting index of the loop
- Add->setOperand(1, LowerBound);
+ Add->setOperand(1, LoadTaskLB);
}
}
}
}
for (Instruction *I : llvm::reverse(ToBeDeleted)) {
+ while (!I->use_empty()) {
+ I->user_back()->eraseFromParent();
+ }
I->eraseFromParent();
}
};
>From af965a4f03dd0e04fc25ab522ee8e13053af9181 Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Wed, 17 Dec 2025 11:42:46 +0000
Subject: [PATCH 05/14] Updates to bounds rework
- Force the first 3 entries of the StructArg to be the bounds info (lb, ub, step)
- Ensure the bounds handling works when the tasks are executed in parallel
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 2 +
.../llvm/Transforms/Utils/CodeExtractor.h | 4 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 76 ++++++++++++-------
3 files changed, 51 insertions(+), 31 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 72c23cf263c9c..b4dfb5ae20d52 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -14,6 +14,7 @@
#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
+#include "llvm/ADT/SetVector.h"
#include "llvm/Frontend/Atomic/Atomic.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
@@ -2364,6 +2365,7 @@ class OpenMPIRBuilder {
PostOutlineCBTy PostOutlineCB;
BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
SmallVector<Value *, 2> ExcludeArgsFromAggregate;
+ SetVector<Value *> Inputs, Outputs;
// TODO: this should be safe to enable by default
bool FixUpNonEntryAllocas = false;
diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
index 407eb50d2c7a3..3e2c69b47bc48 100644
--- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -171,9 +171,9 @@ class CodeExtractorAnalysisCache {
///
/// \param CEAC - Cache to speed up operations for the CodeExtractor when
/// hoisting, and extracting lifetime values and assumes.
- /// \param Inputs [out] - filled with values marked as inputs to the
+ /// \param Inputs [in/out] - filled with values marked as inputs to the
/// newly outlined function.
- /// \param Outputs [out] - filled with values marked as outputs to the
+ /// \param Outputs [out] - filled with values marked as outputs to the
/// newly outlined function.
/// \returns zero when called on a CodeExtractor instance where isEligible
/// returns false.
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 39adde453b1eb..dce6c1aa7ee5f 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -403,18 +403,19 @@ Value *createFakeIntVal(IRBuilderBase &Builder,
OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
- const Twine &Name = "", bool AsPtr = true) {
+ const Twine &Name = "", bool AsPtr = true,
+ bool Is64Bit = false) {
Builder.restoreIP(OuterAllocaIP);
+ IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
Instruction *FakeVal;
AllocaInst *FakeValAddr =
- Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
+ Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
ToBeDeleted.push_back(FakeValAddr);
if (AsPtr) {
FakeVal = FakeValAddr;
} else {
- FakeVal =
- Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
+ FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
ToBeDeleted.push_back(FakeVal);
}
@@ -422,11 +423,10 @@ Value *createFakeIntVal(IRBuilderBase &Builder,
Builder.restoreIP(InnerAllocaIP);
Instruction *UseFakeVal;
if (AsPtr) {
- UseFakeVal =
- Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
+ UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
} else {
- UseFakeVal =
- cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
+ UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
+ FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
}
ToBeDeleted.push_back(UseFakeVal);
return FakeVal;
@@ -830,7 +830,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
for (auto *V : OI.ExcludeArgsFromAggregate)
Extractor.excludeArgFromAggregate(V);
- Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
+ Function *OutlinedFn =
+ Extractor.extractCodeRegion(CEAC, OI.Inputs, OI.Outputs);
// Forward target-cpu, target-features attributes to the outlined function.
auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
@@ -2069,21 +2070,39 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
// dummy instruction to be used as a fake argument
OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
- createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
- "global.lb", false);
- createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
- "global.ub", false);
- createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
- "global.step", false);
+ Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
+ TaskloopAllocaIP, "lb", false, true);
+ Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
+ TaskloopAllocaIP, "ub", false, true);
+ Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
+ TaskloopAllocaIP, "step", false, true);
+ /* For Taskloop, we want to force the bounds being the first 3 inputs in the
+ * aggregate struct*/
+ OI.Inputs.insert(FakeLB);
+ OI.Inputs.insert(FakeUB);
+ OI.Inputs.insert(FakeStep);
OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Tied,
- TaskloopAllocaBB, CLI, Loc,
- ToBeDeleted](Function &OutlinedFn) mutable {
+ TaskloopAllocaBB, CLI, Loc, ToBeDeleted, FakeLB, FakeUB,
+ FakeStep](Function &OutlinedFn) mutable {
// Replace the Stale CI by appropriate RTL function call.
assert(OutlinedFn.hasOneUse() &&
"there must be a single user for the outlined function");
CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+ /* Create the casting for the Bounds Values that can be used when outlining
+ * to replace the uses of the fakes with real values */
+ BasicBlock *CodeReplBB = StaleCI->getParent();
+ IRBuilderBase::InsertPoint CurrentIp = Builder.saveIP();
+ Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
+ Value *CastedLBVal =
+ Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
+ Value *CastedUBVal =
+ Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
+ Value *CastedStepVal =
+ Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
+ Builder.restoreIP(CurrentIp);
+
Builder.SetInsertPoint(StaleCI);
// Gather the arguments for emitting the runtime call for
@@ -2132,16 +2151,13 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
// Get the pointer to loop lb, ub, step from task ptr
// and set up the lowerbound,upperbound and step values
llvm::Value *Lb = Builder.CreateStructGEP(ArgStructType, TaskShareds, 0);
- Value *LbValExt = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
- Builder.CreateStore(LbValExt, Lb);
+ Builder.CreateStore(CastedLBVal, Lb);
llvm::Value *Ub = Builder.CreateStructGEP(ArgStructType, TaskShareds, 1);
- Value *UbValExt = Builder.CreateSExt(UBVal, Builder.getInt64Ty());
- Builder.CreateStore(UbValExt, Ub);
+ Builder.CreateStore(CastedUBVal, Ub);
llvm::Value *Step = Builder.CreateStructGEP(ArgStructType, TaskShareds, 2);
- Value *StepExt = Builder.CreateSExt(StepVal, Builder.getInt64Ty());
- Builder.CreateStore(StepExt, Step);
+ Builder.CreateStore(CastedStepVal, Step);
llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
// set up the arguments for emitting kmpc_taskloop runtime call
@@ -2179,7 +2195,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Value *IV = CLI->getIndVar();
Type *IVTy = IV->getType();
- Constant *One = ConstantInt::get(IVTy, 1);
+ Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
// When outlining, CodeExtractor will create GEP's to the LowerBound and
// UpperBound. These GEP's can be reused for loading the tasks respective
@@ -2219,8 +2235,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
Value *TripCountMinusOne = Builder.CreateSub(LoadTaskUB, LoadTaskLB);
Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
+ Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
+ Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
// set the trip count in the CLI
- CLI->setTripCount(TripCount);
+ CLI->setTripCount(CastedTripCount);
Builder.SetInsertPoint(CLI->getBody(),
CLI->getBody()->getFirstInsertionPt());
@@ -2231,16 +2249,16 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
if (Add->getOpcode() == llvm::Instruction::Add) {
if (llvm::isa<llvm::BinaryOperator>(Add->getOperand(0))) {
// update the starting index of the loop
- Add->setOperand(1, LoadTaskLB);
+ Add->setOperand(1, CastedTaskLB);
}
}
}
}
+ FakeLB->replaceAllUsesWith(CastedLBVal);
+ FakeUB->replaceAllUsesWith(CastedUBVal);
+ FakeStep->replaceAllUsesWith(CastedStepVal);
for (Instruction *I : llvm::reverse(ToBeDeleted)) {
- while (!I->use_empty()) {
- I->user_back()->eraseFromParent();
- }
I->eraseFromParent();
}
};
>From 770c50dc960b57d79367c9a315de2e2463605d74 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Fri, 2 Jan 2026 15:47:52 +0000
Subject: [PATCH 06/14] kaviya's review comments
Comments at https://github.com/Stylie777/llvm-project/pull/3
---
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 1 -
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 50 +++++++++++++------
2 files changed, 34 insertions(+), 17 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index bb12c1558766b..152a8f727310a 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -95,7 +95,6 @@ __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidP
__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr)
__OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8)
__OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr)
-__OMP_STRUCT_TYPE(Taskloop, kmp_task_info, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr, Int64, Int64, Int64)
__OMP_STRUCT_TYPE(ConfigurationEnvironment, ConfigurationEnvironmentTy, false,
Int8, Int8, Int8, Int32, Int32, Int32, Int32, Int32, Int32)
__OMP_STRUCT_TYPE(DynamicEnvironment, DynamicEnvironmentTy, false, Int16)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index dce6c1aa7ee5f..ff8c9200aa5db 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2076,8 +2076,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
TaskloopAllocaIP, "ub", false, true);
Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
TaskloopAllocaIP, "step", false, true);
- /* For Taskloop, we want to force the bounds being the first 3 inputs in the
- * aggregate struct*/
+ // For Taskloop, we want to force the bounds being the first 3 inputs in the
+ // aggregate struct
OI.Inputs.insert(FakeLB);
OI.Inputs.insert(FakeUB);
OI.Inputs.insert(FakeStep);
@@ -2121,7 +2121,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Value *Flags = Builder.getInt32(Tied);
Value *TaskSize = Builder.getInt64(
- divideCeil(M.getDataLayout().getTypeSizeInBits(Taskloop), 8));
+ divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
AllocaInst *ArgStructAlloca =
dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
@@ -2150,14 +2150,14 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
SharedsSize);
// Get the pointer to loop lb, ub, step from task ptr
// and set up the lowerbound,upperbound and step values
- llvm::Value *Lb = Builder.CreateStructGEP(ArgStructType, TaskShareds, 0);
- Builder.CreateStore(CastedLBVal, Lb);
+ llvm::Value *Lb = Builder.CreateGEP(
+ ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
- llvm::Value *Ub = Builder.CreateStructGEP(ArgStructType, TaskShareds, 1);
- Builder.CreateStore(CastedUBVal, Ub);
+ llvm::Value *Ub = Builder.CreateGEP(
+ ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
- llvm::Value *Step = Builder.CreateStructGEP(ArgStructType, TaskShareds, 2);
- Builder.CreateStore(CastedStepVal, Step);
+ llvm::Value *Step = Builder.CreateGEP(
+ ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
// set up the arguments for emitting kmpc_taskloop runtime call
@@ -2243,13 +2243,31 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Builder.SetInsertPoint(CLI->getBody(),
CLI->getBody()->getFirstInsertionPt());
- llvm::BasicBlock *Body = CLI->getBody();
- for (llvm::Instruction &I : *Body) {
- if (auto *Add = llvm::dyn_cast<llvm::BinaryOperator>(&I)) {
- if (Add->getOpcode() == llvm::Instruction::Add) {
- if (llvm::isa<llvm::BinaryOperator>(Add->getOperand(0))) {
- // update the starting index of the loop
- Add->setOperand(1, CastedTaskLB);
+ // The canonical loop is generated with a fixed lower bound. We need to
+ // update the index calculation code to use the task's lower bound. The
+ // generated code looks like this:
+ // %omp_loop.iv = phi ...
+ // ...
+ // %tmp = mul [type] %omp_loop.iv, step
+ // %user_index = add [type] tmp, lb
+ // OpenMPIRBuilder constructs canonical loops to have exactly three uses of
+ // the normalised induction variable:
+ // 1. This one: converting the normalised IV to the user IV
+ // 2. The increment (add)
+ // 3. The comparison against the trip count (icmp)
+ // (1) is the only use that is a mul followed by an add so this cannot match
+ // other IR.
+ assert(CLI->getIndVar()->getNumUses() == 3 &&
+ "Canonical loop should have exactly three uses of the ind var");
+ for (User *IVUser : CLI->getIndVar()->users()) {
+ if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
+ if (Mul->getOpcode() == Instruction::Mul) {
+ for (User *MulUser : Mul->users()) {
+ if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
+ if (Add->getOpcode() == Instruction::Add) {
+ Add->setOperand(1, CastedTaskLB);
+ }
+ }
}
}
}
>From bbc399074575e80403d47b225ce2ecdef0c96fe3 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Mon, 5 Jan 2026 10:10:29 +0000
Subject: [PATCH 07/14] [NFC] Refine private var init/copy interfaces
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 60 ++++++++++++++-----
1 file changed, 44 insertions(+), 16 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 71ace3ff45052..6591de631813e 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1630,18 +1630,17 @@ findAssociatedValue(Value privateVar, llvm::IRBuilderBase &builder,
/// allocateAndInitPrivateVars instead of this.
/// This returns the private variable which has been initialized. This
/// variable should be mapped before constructing the body of the Op.
-static llvm::Expected<llvm::Value *> initPrivateVar(
- llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation,
- omp::PrivateClauseOp &privDecl, Value mlirPrivVar, BlockArgument &blockArg,
- llvm::Value *llvmPrivateVar, llvm::BasicBlock *privInitBlock,
- llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
+static llvm::Expected<llvm::Value *>
+initPrivateVar(llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation,
+ omp::PrivateClauseOp &privDecl, llvm::Value *nonPrivateVar,
+ BlockArgument &blockArg, llvm::Value *llvmPrivateVar,
+ llvm::BasicBlock *privInitBlock,
+ llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
Region &initRegion = privDecl.getInitRegion();
if (initRegion.empty())
return llvmPrivateVar;
- // map initialization region block arguments
- llvm::Value *nonPrivateVar = findAssociatedValue(
- mlirPrivVar, builder, moduleTranslation, mappedPrivateVars);
assert(nonPrivateVar);
moduleTranslation.mapValue(privDecl.getInitMoldArg(), nonPrivateVar);
moduleTranslation.mapValue(privDecl.getInitPrivateArg(), llvmPrivateVar);
@@ -1666,6 +1665,19 @@ static llvm::Expected<llvm::Value *> initPrivateVar(
return phis[0];
}
+/// Version of initPrivateVar which looks up the nonPrivateVar from mlirPrivVar.
+static llvm::Expected<llvm::Value *> initPrivateVar(
+ llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation,
+ omp::PrivateClauseOp &privDecl, Value mlirPrivVar, BlockArgument &blockArg,
+ llvm::Value *llvmPrivateVar, llvm::BasicBlock *privInitBlock,
+ llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
+ return initPrivateVar(
+ builder, moduleTranslation, privDecl,
+ findAssociatedValue(mlirPrivVar, builder, moduleTranslation,
+ mappedPrivateVars),
+ blockArg, llvmPrivateVar, privInitBlock, mappedPrivateVars);
+}
+
static llvm::Error
initPrivateVars(llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
@@ -1751,7 +1763,7 @@ allocatePrivateVars(llvm::IRBuilderBase &builder,
static LogicalResult copyFirstPrivateVars(
mlir::Operation *op, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
- SmallVectorImpl<mlir::Value> &mlirPrivateVars,
+ SmallVectorImpl<llvm::Value *> &moldVars,
ArrayRef<llvm::Value *> llvmPrivateVars,
SmallVectorImpl<omp::PrivateClauseOp> &privateDecls, bool insertBarrier,
llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
@@ -1769,19 +1781,15 @@ static LogicalResult copyFirstPrivateVars(
splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
setInsertPointForPossiblyEmptyBlock(builder, copyBlock);
- for (auto [decl, mlirVar, llvmVar] :
- llvm::zip_equal(privateDecls, mlirPrivateVars, llvmPrivateVars)) {
+ for (auto [decl, moldVar, llvmVar] :
+ llvm::zip_equal(privateDecls, moldVars, llvmPrivateVars)) {
if (decl.getDataSharingType() != omp::DataSharingClauseType::FirstPrivate)
continue;
// copyRegion implements `lhs = rhs`
Region &copyRegion = decl.getCopyRegion();
- // map copyRegion rhs arg
- llvm::Value *nonPrivateVar = findAssociatedValue(
- mlirVar, builder, moduleTranslation, mappedPrivateVars);
- assert(nonPrivateVar);
- moduleTranslation.mapValue(decl.getCopyMoldArg(), nonPrivateVar);
+ moduleTranslation.mapValue(decl.getCopyMoldArg(), moldVar);
// map copyRegion lhs arg
moduleTranslation.mapValue(decl.getCopyPrivateArg(), llvmVar);
@@ -1812,6 +1820,26 @@ static LogicalResult copyFirstPrivateVars(
return success();
}
+static LogicalResult copyFirstPrivateVars(
+ mlir::Operation *op, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation,
+ SmallVectorImpl<mlir::Value> &mlirPrivateVars,
+ ArrayRef<llvm::Value *> llvmPrivateVars,
+ SmallVectorImpl<omp::PrivateClauseOp> &privateDecls, bool insertBarrier,
+ llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
+ llvm::SmallVector<llvm::Value *> moldVars(mlirPrivateVars.size());
+ llvm::transform(mlirPrivateVars, moldVars.begin(), [&](mlir::Value mlirVar) {
+ // map copyRegion rhs arg
+ llvm::Value *moldVar = findAssociatedValue(
+ mlirVar, builder, moduleTranslation, mappedPrivateVars);
+ assert(moldVar);
+ return moldVar;
+ });
+ return copyFirstPrivateVars(op, builder, moduleTranslation, moldVars,
+ llvmPrivateVars, privateDecls, insertBarrier,
+ mappedPrivateVars);
+}
+
static LogicalResult
cleanupPrivateVars(llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation, Location loc,
>From c4b361a76a5145e45ae4b42da6c1746110ef6292 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Mon, 5 Jan 2026 10:10:51 +0000
Subject: [PATCH 08/14] Task duplication function generation
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 61 +++++++++-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 100 ++++++++++++++-
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 115 ++++++++++++++++--
3 files changed, 258 insertions(+), 18 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index b4dfb5ae20d52..7ba7a05ac71a3 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -647,6 +647,38 @@ class OpenMPIRBuilder {
using BodyGenCallbackTy =
function_ref<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
+ /// Callback type for task duplication function code generation. This is the
+ /// task duplication function passed to __kmpc_taskloop. It is expected that
+ /// this function will set up (first)private variables in the duplicated task
+ /// which have non-trivial (copy-)constructors. Insertion points are handled
+ /// the same way as for BodyGenCallbackTy.
+ ///
+ /// \ref createTaskloop lays out the task's auxiliary data structure as:
+ /// `{ lower bound, upper bound, step, data... }`. DestPtr and SrcPtr point
+ /// to this data.
+ ///
+ /// It is acceptable for the callback to be set to nullptr. In that case no
+ /// function will be generated and nullptr will be passed as the task
+ /// duplication function to __kmpc_taskloop.
+ ///
+ /// \param AllocaIP is the insertion point at which new alloca instructions
+ /// should be placed. The BasicBlock it is pointing to must
+ /// not be split.
+ /// \param CodeGenIP is the insertion point at which the body code should be
+ /// placed.
+ /// \param DestPtr This is a pointer to data inside the newly duplicated
+ /// task's auxiliary data structure (allocated after the task
+ /// descriptor.)
+ /// \param SrcPtr This is a pointer to data inside the original task's
+ /// auxiliary data structure (allocated after the task
+ /// descriptor.)
+ ///
+ /// \return The insertion point immediately after the generated code, or an
+ /// error if any occurred.
+ using TaskDupCallbackTy = function_ref<Expected<InsertPointTy>(
+ InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr,
+ Value *SrcPtr)>;
+
// This is created primarily for sections construct as llvm::function_ref
// (BodyGenCallbackTy) is not storable (as described in the comments of
// function_ref class - function_ref contains non-ownable reference
@@ -1217,6 +1249,26 @@ class OpenMPIRBuilder {
LoopAnalysis &LIA, LoopInfo &LI, llvm::Loop *L,
const Twine &NamePrefix = "");
+ /// Creates a task duplication function to be passed to kmpc_taskloop.
+ ///
+ /// The OpenMP runtime defines this function as taking the destination
+ /// kmp_task_t, source kmp_task_t, and a lastprivate flag. This function is
+ /// called on the source and destination tasks after the source task has been
+ /// duplicated to create the destination task. At this point the destination
+ /// task has been otherwise set up from the runtime's perspective, but this
+ /// function is needed to fix up any data for the duplicated task e.g. private
+ /// variables with non-trivial constructors.
+ ///
+ /// \param PrivatesTy The type of the privates structure for the task.
+ /// \param PrivatesIndex The index inside the privates structure containing
+ /// the data for the callback.
+ /// \param DupCB The callback to generate the duplication code. See
+ /// documentation for \ref TaskDupCallbackTy. This can be
+ /// nullptr.
+ Expected<Value *> createTaskDuplicationFunction(Type *PrivatesTy,
+ int32_t PrivatesIndex,
+ TaskDupCallbackTy DupCB);
+
public:
/// Modifies the canonical loop to be a workshare loop.
///
@@ -1413,11 +1465,18 @@ class OpenMPIRBuilder {
/// \param UBVal Upperbound value of loop
/// \param StepVal Step value of loop
/// \param Tied True if the task is tied, false if the task is untied.
+ /// \param DupCB The callback to generate the duplication code. See
+ /// documentation for \ref TaskDupCallbackTy. This can be nullptr.
+ /// \param TaskContextStructPtrVal If non-null, a pointer to be placed
+ /// immediately after the {lower bound, upper
+ /// bound, step} values in the task data.
LLVM_ABI InsertPointOrErrorTy createTaskloop(
const LocationDescription &Loc, InsertPointTy AllocaIP,
BodyGenCallbackTy BodyGenCB,
llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
- Value *LBVal, Value *UBVal, Value *StepVal, bool Tied = true);
+ Value *LBVal, Value *UBVal, Value *StepVal, bool Tied = true,
+ TaskDupCallbackTy DupCB = nullptr,
+ Value *TaskContextStructPtrVal = nullptr);
/// Generator for `#omp task`
///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index ff8c9200aa5db..e32e92721d4d6 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -50,6 +50,7 @@
#include "llvm/IR/Value.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/VirtualFileSystem.h"
@@ -2026,11 +2027,77 @@ static Value *emitTaskDependencies(
return DepArray;
}
+/// Create the task duplication function passed to kmpc_taskloop.
+Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
+ Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
+ unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
+ if (!DupCB)
+ return Constant::getNullValue(
+ PointerType::get(Builder.getContext(), ProgramAddressSpace));
+
+ // From OpenMP Runtime p_task_dup_t:
+ // Routine optionally generated by the compiler for setting the lastprivate
+ // flag and calling needed constructors for private/firstprivate objects (used
+ // to form taskloop tasks from pattern task) Parameters: dest task, src task,
+ // lastprivate flag.
+ // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
+
+ auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
+
+ FunctionType *DupFuncTy = FunctionType::get(
+ Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
+ /*isVarArg=*/false);
+
+ Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
+ "omp_taskloop_dup", M);
+ Value *DestTaskArg = DupFunction->getArg(0);
+ Value *SrcTaskArg = DupFunction->getArg(1);
+ Value *LastprivateFlagArg = DupFunction->getArg(2);
+ DestTaskArg->setName("dest_task");
+ SrcTaskArg->setName("src_task");
+ LastprivateFlagArg->setName("lastprivate_flag");
+
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(
+ BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
+
+ auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
+ Type *TaskWithPrivatesTy =
+ StructType::get(Builder.getContext(), {Task, PrivatesTy});
+ Value *TaskPrivates = Builder.CreateGEP(
+ TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
+ Value *ContextPtr = Builder.CreateGEP(
+ PrivatesTy, TaskPrivates,
+ {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
+ return ContextPtr;
+ };
+
+ Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
+ Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
+
+ DestTaskContextPtr->setName("destPtr");
+ SrcTaskContextPtr->setName("srcPtr");
+
+ InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
+ DupFunction->getEntryBlock().begin());
+ InsertPointTy CodeGenIP = Builder.saveIP();
+ Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
+ DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
+ if (!AfterIPOrError)
+ return AfterIPOrError.takeError();
+ Builder.restoreIP(*AfterIPOrError);
+
+ Builder.CreateRetVoid();
+
+ return DupFunction;
+}
+
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
const LocationDescription &Loc, InsertPointTy AllocaIP,
BodyGenCallbackTy BodyGenCB,
llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
- Value *LBVal, Value *UBVal, Value *StepVal, bool Tied) {
+ Value *LBVal, Value *UBVal, Value *StepVal, bool Tied,
+ TaskDupCallbackTy DupCB, Value *TaskContextStructPtrVal) {
if (!updateToLocation(Loc))
return InsertPointTy();
@@ -2081,10 +2148,33 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
OI.Inputs.insert(FakeLB);
OI.Inputs.insert(FakeUB);
OI.Inputs.insert(FakeStep);
+ if (TaskContextStructPtrVal)
+ OI.Inputs.insert(TaskContextStructPtrVal);
+ assert(
+ (TaskContextStructPtrVal && DupCB) ||
+ (!TaskContextStructPtrVal && !DupCB) &&
+ "Task context struct ptr and duplication callback must be both set "
+ "or both null");
+
+ // It isn't safe to run the duplication bodygen callback inside the post
+ // outlining callback so this has to be run now before we know the real task
+ // shareds structure type.
+ unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
+ Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
+ Type *FakeSharedsTy = StructType::get(
+ Builder.getContext(),
+ {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
+ Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
+ FakeSharedsTy,
+ /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
+ if (!TaskDupFnOrErr) {
+ return TaskDupFnOrErr.takeError();
+ }
+ Value *TaskDupFn = *TaskDupFnOrErr;
OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Tied,
- TaskloopAllocaBB, CLI, Loc, ToBeDeleted, FakeLB, FakeUB,
- FakeStep](Function &OutlinedFn) mutable {
+ TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
+ FakeLB, FakeUB, FakeStep](Function &OutlinedFn) mutable {
// Replace the Stale CI by appropriate RTL function call.
assert(OutlinedFn.hasOneUse() &&
"there must be a single user for the outlined function");
@@ -2166,9 +2256,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Value *NoGroup = Builder.getInt32(1);
Value *Sched = Builder.getInt32(0);
Value *GrainSize = Builder.getInt64(0);
-
- // TODO: Handle the case when TaskDup pointer isn't empty
- Value *TaskDup = Constant::getNullValue(Builder.getPtrTy());
+ Value *TaskDup = TaskDupFn;
Value *Args[] = {Ident, ThreadID, TaskData, IfVal, Lb, Ub,
Loadstep, NoGroup, Sched, GrainSize, TaskDup};
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6591de631813e..dc0462e88093b 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2245,6 +2245,13 @@ class TaskContextStructManager {
/// private decls.
void createGEPsToPrivateVars();
+ /// Given the address of the structure, return a GEP for each private variable
+ /// in the structure. Null values are added where private decls were skipped
+ /// so that the ordering continues to match the private decls.
+ /// Must be called after generateTaskContextStruct().
+ SmallVector<llvm::Value *>
+ createGEPsToPrivateVars(llvm::Value *altStructPtr) const;
+
llvm::Value *isAllocated();
/// De-allocate the task context structure.
@@ -2303,28 +2310,36 @@ void TaskContextStructManager::generateTaskContextStruct() {
"omp.task.context_ptr");
}
-void TaskContextStructManager::createGEPsToPrivateVars() {
- if (!structPtr) {
- assert(privateVarTypes.empty());
- return;
- }
+SmallVector<llvm::Value *> TaskContextStructManager::createGEPsToPrivateVars(
+ llvm::Value *altStructPtr) const {
+ assert(!privateVarTypes.empty());
+ SmallVector<llvm::Value *> ret;
// Create GEPs for each struct member
- llvmPrivateVarGEPs.clear();
- llvmPrivateVarGEPs.reserve(privateDecls.size());
+ ret.reserve(privateDecls.size());
llvm::Value *zero = builder.getInt32(0);
unsigned i = 0;
for (auto privDecl : privateDecls) {
if (!privDecl.readsFromMold()) {
// Handle this inside of the task so we don't pass unnessecary vars in
- llvmPrivateVarGEPs.push_back(nullptr);
+ ret.push_back(nullptr);
continue;
}
llvm::Value *iVal = builder.getInt32(i);
- llvm::Value *gep = builder.CreateGEP(structTy, structPtr, {zero, iVal});
- llvmPrivateVarGEPs.push_back(gep);
+ llvm::Value *gep = builder.CreateGEP(structTy, altStructPtr, {zero, iVal});
+ ret.push_back(gep);
i += 1;
}
+ return ret;
+}
+
+void TaskContextStructManager::createGEPsToPrivateVars() {
+ if (!structPtr) {
+ assert(privateVarTypes.empty());
+ return;
+ }
+
+ llvmPrivateVarGEPs = createGEPsToPrivateVars(structPtr);
}
llvm::Value *TaskContextStructManager::isAllocated() {
@@ -2751,6 +2766,79 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
return llvm::Error::success();
};
+ // Taskloop divides into an appropriate number of tasks by repeatedly
+ // duplicating the original task. Each time this is done, the task context
+ // structure must be duplicated too.
+ auto taskDupCB = [&](InsertPointTy AllocaIP, InsertPointTy CodegenIP,
+ llvm::Value *destPtr, llvm::Value *srcPtr)
+ -> llvm::Expected<llvm::IRBuilderBase::InsertPoint> {
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ builder.restoreIP(CodegenIP);
+
+ llvm::Type *ptrTy =
+ builder.getPtrTy(srcPtr->getType()->getPointerAddressSpace());
+ llvm::Value *src =
+ builder.CreateLoad(ptrTy, srcPtr, "omp.taskloop.context.src");
+
+ TaskContextStructManager &srcStructMgr = taskStructMgr;
+ TaskContextStructManager destStructMgr(builder, moduleTranslation,
+ privateVarsInfo.privatizers);
+ destStructMgr.generateTaskContextStruct();
+ llvm::Value *dest = destStructMgr.getStructPtr();
+ dest->setName("omp.taskloop.context.dest");
+ builder.CreateStore(dest, destPtr);
+
+ llvm::SmallVector<llvm::Value *> srcGEPs =
+ srcStructMgr.createGEPsToPrivateVars(src);
+ llvm::SmallVector<llvm::Value *> destGEPs =
+ destStructMgr.createGEPsToPrivateVars(dest);
+
+ // Inline init regions.
+ for (auto [privDecl, mold, blockArg, llvmPrivateVarAlloc] :
+ llvm::zip_equal(privateVarsInfo.privatizers, srcGEPs,
+ privateVarsInfo.blockArgs, destGEPs)) {
+ // To be handled inside task body.
+ if (!privDecl.readsFromMold())
+ continue;
+ assert(llvmPrivateVarAlloc &&
+ "reads from mold so shouldn't have been skipped");
+
+ llvm::Expected<llvm::Value *> privateVarOrErr =
+ initPrivateVar(builder, moduleTranslation, privDecl, mold, blockArg,
+ llvmPrivateVarAlloc, builder.GetInsertBlock());
+ if (!privateVarOrErr)
+ return privateVarOrErr.takeError();
+
+ setInsertPointForPossiblyEmptyBlock(builder);
+
+ // TODO: this is a bit of a hack for Fortran character boxes.
+ // Character boxes are passed by value into the init region and then the
+ // initialized character box is yielded by value. Here we need to store
+ // the yielded value into the private allocation, and load the private
+ // allocation to match the type expected by region block arguments.
+ if ((privateVarOrErr.get() != llvmPrivateVarAlloc) &&
+ !mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
+ builder.CreateStore(privateVarOrErr.get(), llvmPrivateVarAlloc);
+ // Load it so we have the value pointed to by the GEP
+ llvmPrivateVarAlloc = builder.CreateLoad(
+ privateVarOrErr.get()->getType(), llvmPrivateVarAlloc);
+ }
+ assert(llvmPrivateVarAlloc->getType() ==
+ moduleTranslation.convertType(blockArg.getType()));
+
+ // Mapping blockArg -> llvmPrivateVarAlloc is done inside the body
+ // callback so that OpenMPIRBuilder doesn't try to pass each GEP address
+ // through a stack allocated structure.
+ }
+
+ if (failed(copyFirstPrivateVars(
+ &opInst, builder, moduleTranslation, srcGEPs, destGEPs,
+ privateVarsInfo.privatizers, taskloopOp.getPrivateNeedsBarrier())))
+ return llvm::make_error<PreviouslyReportedError>();
+
+ return builder.saveIP();
+ };
+
auto loopOp = cast<omp::LoopNestOp>(taskloopOp.getWrappedLoop());
auto loopInfo = [&]() -> llvm::Expected<llvm::CanonicalLoopInfo *> {
@@ -2758,13 +2846,18 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
return loopInfo;
};
+ llvm::OpenMPIRBuilder::TaskDupCallbackTy taskDupOrNull = nullptr;
+ if (!taskStructMgr.getLLVMPrivateVarGEPs().empty())
+ taskDupOrNull = taskDupCB;
+
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
moduleTranslation.getOpenMPBuilder()->createTaskloop(
ompLoc, allocaIP, bodyCB, loopInfo,
moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[0]),
moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[0]),
- moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]));
+ moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]),
+ /*Tied=*/true, taskDupOrNull, taskStructMgr.getStructPtr());
if (failed(handleError(afterIP, opInst)))
return failure();
>From 33a8f1580fad266941e8bc0b22789ec97e2e983e Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Sun, 4 Jan 2026 14:31:02 +0000
Subject: [PATCH 09/14] Fix freeing private vars and context struct
I decided not to fix the TODO about zero iteration taskloops because
this is part of a larger problem affecting similar constructs e.g.
ordinary tasks with an if clause that evaluates to false.
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 52 ++++++-------------
1 file changed, 17 insertions(+), 35 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index dc0462e88093b..c84f1e38127d7 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2252,8 +2252,6 @@ class TaskContextStructManager {
SmallVector<llvm::Value *>
createGEPsToPrivateVars(llvm::Value *altStructPtr) const;
- llvm::Value *isAllocated();
-
/// De-allocate the task context structure.
void freeStructPtr();
@@ -2342,26 +2340,13 @@ void TaskContextStructManager::createGEPsToPrivateVars() {
llvmPrivateVarGEPs = createGEPsToPrivateVars(structPtr);
}
-llvm::Value *TaskContextStructManager::isAllocated() {
- if (!structPtr)
- return nullptr;
-
- return builder.CreateIsNotNull(structPtr);
-}
-
void TaskContextStructManager::freeStructPtr() {
if (!structPtr)
return;
llvm::IRBuilderBase::InsertPointGuard guard{builder};
- llvm::BasicBlock *currentBlock = builder.GetInsertBlock();
- if (currentBlock->getTerminator()) {
- // Ensure we don't put the call to free() after the terminator
- builder.SetInsertPoint(currentBlock->getTerminator());
- } else {
- // Insert the call to free() at the end of the current block
- builder.SetInsertPoint(currentBlock);
- }
+ // Ensure we don't put the call to free() after the terminator
+ builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
builder.CreateFree(structPtr);
}
@@ -2635,6 +2620,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
// Allocate and initialize private variables
builder.SetInsertPoint(initBlock->getTerminator());
+ // TODO: don't allocate if the loop has zero iterations.
taskStructMgr.generateTaskContextStruct();
taskStructMgr.createGEPsToPrivateVars();
@@ -2754,15 +2740,25 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
auto continuationBlockOrError =
convertOmpOpRegions(taskloopOp.getRegion(), "omp.taskloop.region",
builder, moduleTranslation);
- ;
+
if (failed(handleError(continuationBlockOrError, opInst)))
return llvm::make_error<PreviouslyReportedError>();
builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
- // dummy check to ensure that the task context structure is accessed inside
- // the outlined fn.
- [[maybe_unused]] llvm::Value *cond = taskStructMgr.isAllocated();
+ // This is freeing the private variables as mapped inside of the task: these
+ // will be per-task private copies possibly after task duplication. This is
+ // handled transparently by how these are passed to the structure passed
+ // into the outlined function. When the task is duplicated, that structure
+ // is duplicated too.
+ if (failed(cleanupPrivateVars(builder, moduleTranslation,
+ taskloopOp.getLoc(), llvmFirstPrivateVars,
+ privateVarsInfo.privatizers)))
+ return llvm::make_error<PreviouslyReportedError>();
+ // Similarly, the task context structure freed inside the task is the
+ // per-task copy after task duplication.
+ taskStructMgr.freeStructPtr();
+
return llvm::Error::success();
};
@@ -2863,20 +2859,6 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
return failure();
builder.restoreIP(*afterIP);
-
- // freeing the task context structure in exit block of taskloop.
- if (failed(cleanupPrivateVars(builder, moduleTranslation, taskloopOp.getLoc(),
- llvmFirstPrivateVars,
- privateVarsInfo.privatizers)))
- return failure();
-
- // Note: This free is valid because end_taskgroup waits until all generated
- // tasks are complete before returning. In the presence of Nogroup clause,
- // @__kmpc_taskgroup(..)/@__kmpc_end_taskgroup(..) is not called, have to
- // ensure that this freeStructPtr() is not called until every thread has
- // completed execution
- taskStructMgr.freeStructPtr();
-
return success();
}
>From 2aa198aefdedbaf560285e3920d9d5b6503e97bc Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Sun, 4 Jan 2026 14:46:26 +0000
Subject: [PATCH 10/14] Fix indexing not to re-order private vars
This is important so that the private var cleanup applies the right
cleanup region to the right variable.
---
.../Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index c84f1e38127d7..17021110069a5 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2625,12 +2625,11 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
taskStructMgr.createGEPsToPrivateVars();
llvmFirstPrivateVars.resize(privateVarsInfo.blockArgs.size());
- int index = 0;
- for (auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVarAlloc] :
- llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.mlirVars,
- privateVarsInfo.blockArgs,
- taskStructMgr.getLLVMPrivateVarGEPs())) {
+ for (auto [i, zip] : llvm::enumerate(llvm::zip_equal(
+ privateVarsInfo.privatizers, privateVarsInfo.mlirVars,
+ privateVarsInfo.blockArgs, taskStructMgr.getLLVMPrivateVarGEPs()))) {
+ auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVarAlloc] = zip;
// To be handled inside the taskloop.
if (!privDecl.readsFromMold())
continue;
@@ -2643,7 +2642,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
if (!privateVarOrErr)
return handleError(privateVarOrErr, *taskloopOp.getOperation());
- llvmFirstPrivateVars[index++] = privateVarOrErr.get();
+ llvmFirstPrivateVars[i] = privateVarOrErr.get();
llvm::IRBuilderBase::InsertPointGuard guard(builder);
builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
@@ -2704,8 +2703,6 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
return privateVarOrError.takeError();
moduleTranslation.mapValue(blockArg, privateVarOrError.get());
privateVarsInfo.llvmVars[i] = privateVarOrError.get();
- // Add private var to llvmFirstPrivateVars
- llvmFirstPrivateVars[index++] = privateVarOrError.get();
}
taskStructMgr.createGEPsToPrivateVars();
@@ -2752,7 +2749,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
// into the outlined function. When the task is duplicated, that structure
// is duplicated too.
if (failed(cleanupPrivateVars(builder, moduleTranslation,
- taskloopOp.getLoc(), llvmFirstPrivateVars,
+ taskloopOp.getLoc(), privateVarsInfo.llvmVars,
privateVarsInfo.privatizers)))
return llvm::make_error<PreviouslyReportedError>();
// Similarly, the task context structure freed inside the task is the
>From 86383a36b1ad58ec0eba283369b193d6cb8be591 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Sun, 4 Jan 2026 16:57:19 +0000
Subject: [PATCH 11/14] Fix loop trip count
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index e32e92721d4d6..a71bcb0bb5392 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2321,7 +2321,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
- Value *TripCountMinusOne = Builder.CreateSub(LoadTaskUB, LoadTaskLB);
+ Value *TripCountMinusOne =
+ Builder.CreateSDiv(Builder.CreateSub(LoadTaskUB, LoadTaskLB), FakeStep);
Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
>From 6df8704f3474985c89f8ea5fdba63d11ed80d9a3 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Wed, 31 Dec 2025 13:47:00 +0000
Subject: [PATCH 12/14] [mlir][OpenMP] Implement OutlineableOpenMPOpInterface
for Taskloop
The body of taskloop is outlined and so OutlineableOpenMPOpInterface is
needed to ensure that language frontends know not to hoist allocas
outside of the body of taskloop.
The complication here is that taskloop is also a loop wrapper. Currently
some code assumes that taskloop contains only the wrapped loop, and so
there is no place to put the allocas other than in the loop body. This
is obviously not good. Unfortunately LLVM does not seem to be able to
hoist these allocas back out of the loop. The taskloop loop body will
need to contain stack saves and restores, which unfortunately hinder
some optimizations.
I think it is better to land some taskloop support in LLVM 22 than none at all.
It will take more work to find an appropriate MLIR representation for
allocas inside of outlinable loop wrappers.
---
mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 1 +
mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td | 5 +++++
2 files changed, 6 insertions(+)
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index bbfe805eefe48..1fcd7b3c23e10 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -948,6 +948,7 @@ def TaskOp
def TaskloopOp : OpenMP_Op<"taskloop", traits = [
AttrSizedOperandSegments, AutomaticAllocationScope,
DeclareOpInterfaceMethods<ComposableOpInterface>,
+ DeclareOpInterfaceMethods<OutlineableOpenMPOpInterface>,
DeclareOpInterfaceMethods<LoopWrapperInterface>, NoTerminator,
RecursiveMemoryEffects, SingleBlock
], clauses = [
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
index d471e6c0ed70b..fd500134e10f9 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
@@ -182,6 +182,11 @@ def OutlineableOpenMPOpInterface : OpInterface<"OutlineableOpenMPOpInterface"> {
let methods = [
InterfaceMethod<"Get alloca block", "::mlir::Block*", "getAllocaBlock",
(ins), [{
+ // For taskloop: put the allocas inside of the wrapped loop. Loop wrappers
+ // are expected to contain only the wrapped loop (or another loop wrapper)
+ if (LoopWrapperInterface loopWrapper =
+ mlir::dyn_cast<LoopWrapperInterface>($_op.getOperation()))
+ return &loopWrapper.getWrappedLoop()->getRegion(0).front();
return &$_op.getRegion().front();
}]>,
];
>From 257aaeaafe2059766fae62459336f3f2dc592421 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Sun, 4 Jan 2026 18:41:12 +0000
Subject: [PATCH 13/14] Update test
---
mlir/test/Target/LLVMIR/openmp-taskloop.mlir | 234 +++++++++----------
1 file changed, 116 insertions(+), 118 deletions(-)
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
index 8179784a47d90..5f31c547e7485 100644
--- a/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
@@ -32,122 +32,120 @@ llvm.func @_QPtest() {
llvm.return
}
-// CHECK: %struct.kmp_task_info = type { ptr, ptr, i32, ptr, ptr, i64, i64, i64 }
+// CHECK-LABEL: define void @_QPtest() {
+// CHECK: %[[STRUCTARG:.*]] = alloca { i64, i64, i64, ptr }, align 8
+// CHECK: %[[VAL_0:.*]] = alloca i32, i64 1, align 4
+// CHECK: %[[VAL_1:.*]] = alloca i32, i64 1, align 4
+// CHECK: store i32 20, ptr %[[VAL_1]], align 4
+// CHECK: br label %[[VAL_2:.*]]
+// CHECK: entry: ; preds = %[[VAL_3:.*]]
+// CHECK: br label %[[VAL_4:.*]]
+// CHECK: omp.private.init: ; preds = %[[VAL_2]]
+// CHECK: %[[VAL_5:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ i32 }, ptr null, i32 1) to i64))
+// CHECK: %[[VAL_6:.*]] = getelementptr { i32 }, ptr %[[VAL_5]], i32 0, i32 0
+// CHECK: br label %[[VAL_7:.*]]
+// CHECK: omp.private.copy: ; preds = %[[VAL_4]]
+// CHECK: br label %[[VAL_8:.*]]
+// CHECK: omp.private.copy1: ; preds = %[[VAL_7]]
+// CHECK: %[[VAL_9:.*]] = load i32, ptr %[[VAL_1]], align 4
+// CHECK: store i32 %[[VAL_9]], ptr %[[VAL_6]], align 4
+// CHECK: br label %[[VAL_10:.*]]
+// CHECK: omp.taskloop.start: ; preds = %[[VAL_8]]
+// CHECK: br label %[[VAL_11:.*]]
+// CHECK: codeRepl: ; preds = %[[VAL_10]]
+// CHECK: %[[VAL_12:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 0
+// CHECK: store i64 1, ptr %[[VAL_12]], align 4
+// CHECK: %[[VAL_13:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 1
+// CHECK: store i64 5, ptr %[[VAL_13]], align 4
+// CHECK: %[[VAL_14:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 2
+// CHECK: store i64 1, ptr %[[VAL_14]], align 4
+// CHECK: %[[VAL_15:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 3
+// CHECK: store ptr %[[VAL_5]], ptr %[[VAL_15]], align 8
+// CHECK: %[[VAL_16:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK: call void @__kmpc_taskgroup(ptr @1, i32 %[[VAL_16]])
+// CHECK: %[[VAL_17:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[VAL_16]], i32 1, i64 40, i64 32, ptr @_QPtest..omp_par)
+// CHECK: %[[VAL_18:.*]] = load ptr, ptr %[[VAL_17]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_18]], ptr align 1 %[[STRUCTARG]], i64 32, i1 false)
+// CHECK: %[[VAL_19:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_18]], i32 0, i32 0
+// CHECK: %[[VAL_20:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_18]], i32 0, i32 1
+// CHECK: %[[VAL_21:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_18]], i32 0, i32 2
+// CHECK: %[[VAL_22:.*]] = load i64, ptr %[[VAL_21]], align 4
+// CHECK: call void @__kmpc_taskloop(ptr @1, i32 %[[VAL_16]], ptr %[[VAL_17]], i32 1, ptr %[[VAL_19]], ptr %[[VAL_20]], i64 %[[VAL_22]], i32 1, i32 0, i64 0, ptr @omp_taskloop_dup)
+// CHECK: call void @__kmpc_end_taskgroup(ptr @1, i32 %[[VAL_16]])
+// CHECK: br label %[[VAL_23:.*]]
+// CHECK: taskloop.exit: ; preds = %[[VAL_11]]
+// CHECK: ret void
+
+// CHECK-LABEL: define internal void @_QPtest..omp_par(
+// CHECK: taskloop.alloca:
+// CHECK: %[[VAL_24:.*]] = load ptr, ptr %[[VAL_25:.*]], align 8
+// CHECK: %[[VAL_26:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_24]], i32 0, i32 0
+// CHECK: %[[VAL_27:.*]] = load i64, ptr %[[VAL_26]], align 4
+// CHECK: %[[VAL_28:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_24]], i32 0, i32 1
+// CHECK: %[[VAL_29:.*]] = load i64, ptr %[[VAL_28]], align 4
+// CHECK: %[[VAL_30:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_24]], i32 0, i32 2
+// CHECK: %[[VAL_31:.*]] = load i64, ptr %[[VAL_30]], align 4
+// CHECK: %[[VAL_32:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_24]], i32 0, i32 3
+// CHECK: %[[VAL_33:.*]] = load ptr, ptr %[[VAL_32]], align 8, !align !1
+// CHECK: %[[VAL_34:.*]] = alloca i32, align 4
+// CHECK: br label %[[VAL_35:.*]]
+// CHECK: taskloop.body: ; preds = %[[VAL_36:.*]]
+// CHECK: %[[VAL_37:.*]] = getelementptr { i32 }, ptr %[[VAL_33]], i32 0, i32 0
+// CHECK: br label %[[VAL_38:.*]]
+// CHECK: omp.taskloop.region: ; preds = %[[VAL_35]]
+// CHECK: br label %[[VAL_39:.*]]
+// CHECK: omp_loop.preheader: ; preds = %[[VAL_38]]
+// CHECK: %[[VAL_40:.*]] = sub i64 %[[VAL_29]], %[[VAL_27]]
+// CHECK: %[[VAL_41:.*]] = sdiv i64 %[[VAL_40]], 1
+// CHECK: %[[VAL_42:.*]] = add i64 %[[VAL_41]], 1
+// CHECK: %[[VAL_43:.*]] = trunc i64 %[[VAL_42]] to i32
+// CHECK: %[[VAL_44:.*]] = trunc i64 %[[VAL_27]] to i32
+// CHECK: br label %[[VAL_45:.*]]
+// CHECK: omp_loop.header: ; preds = %[[VAL_46:.*]], %[[VAL_39]]
+// CHECK: %[[VAL_47:.*]] = phi i32 [ 0, %[[VAL_39]] ], [ %[[VAL_48:.*]], %[[VAL_46]] ]
+// CHECK: br label %[[VAL_49:.*]]
+// CHECK: omp_loop.cond: ; preds = %[[VAL_45]]
+// CHECK: %[[VAL_50:.*]] = icmp ult i32 %[[VAL_47]], %[[VAL_43]]
+// CHECK: br i1 %[[VAL_50]], label %[[VAL_51:.*]], label %[[VAL_52:.*]]
+// CHECK: omp_loop.exit: ; preds = %[[VAL_49]]
+// CHECK: br label %[[VAL_53:.*]]
+// CHECK: omp_loop.after: ; preds = %[[VAL_52]]
+// CHECK: br label %[[VAL_54:.*]]
+// CHECK: omp.region.cont: ; preds = %[[VAL_53]]
+// CHECK: tail call void @free(ptr %[[VAL_33]])
+// CHECK: br label %[[VAL_55:.*]]
+// CHECK: omp_loop.body: ; preds = %[[VAL_49]]
+// CHECK: %[[VAL_56:.*]] = mul i32 %[[VAL_47]], 1
+// CHECK: %[[VAL_57:.*]] = add i32 %[[VAL_56]], %[[VAL_44]]
+// CHECK: br label %[[VAL_58:.*]]
+// CHECK: omp.loop_nest.region: ; preds = %[[VAL_51]]
+// CHECK: store i32 %[[VAL_57]], ptr %[[VAL_34]], align 4
+// CHECK: %[[VAL_59:.*]] = load i32, ptr %[[VAL_37]], align 4
+// CHECK: %[[VAL_60:.*]] = add i32 %[[VAL_59]], 1
+// CHECK: store i32 %[[VAL_60]], ptr %[[VAL_37]], align 4
+// CHECK: br label %[[VAL_61:.*]]
+// CHECK: omp.region.cont2: ; preds = %[[VAL_58]]
+// CHECK: br label %[[VAL_46]]
+// CHECK: omp_loop.inc: ; preds = %[[VAL_61]]
+// CHECK: %[[VAL_48]] = add nuw i32 %[[VAL_47]], 1
+// CHECK: br label %[[VAL_45]]
+// CHECK: taskloop.exit.exitStub: ; preds = %[[VAL_54]]
+// CHECK: ret void
+
+// CHECK-LABEL: define internal void @omp_taskloop_dup(
+// CHECK: entry:
+// CHECK: %[[VAL_62:.*]] = getelementptr { %[[VAL_63:.*]], { i64, i64, i64, ptr } }, ptr %[[VAL_64:.*]], i32 0, i32 1
+// CHECK: %[[VAL_65:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_62]], i32 0, i32 3
+// CHECK: %[[VAL_66:.*]] = getelementptr { %[[VAL_63]], { i64, i64, i64, ptr } }, ptr %[[VAL_67:.*]], i32 0, i32 1
+// CHECK: %[[VAL_68:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_66]], i32 0, i32 3
+// CHECK: %[[VAL_69:.*]] = load ptr, ptr %[[VAL_68]], align 8
+// CHECK: %[[VAL_70:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ i32 }, ptr null, i32 1) to i64))
+// CHECK: store ptr %[[VAL_70]], ptr %[[VAL_65]], align 8
+// CHECK: %[[VAL_71:.*]] = getelementptr { i32 }, ptr %[[VAL_69]], i32 0, i32 0
+// CHECK: %[[VAL_72:.*]] = getelementptr { i32 }, ptr %[[VAL_70]], i32 0, i32 0
+// CHECK: br label %[[VAL_73:.*]]
+// CHECK: omp.private.copy: ; preds = %[[VAL_74:.*]]
+// CHECK: %[[VAL_75:.*]] = load i32, ptr %[[VAL_71]], align 4
+// CHECK: store i32 %[[VAL_75]], ptr %[[VAL_72]], align 4
+// CHECK: ret void
-// CHECK-LABEL: define void @_QPtest() {
-// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8
-// CHECK: %[[VAL1:.*]] = alloca i32, i64 1, align 4
-// CHECK: %[[VAL_X:.*]] = alloca i32, i64 1, align 4
-// CHECK: store i32 20, ptr %[[VAL_X]], align 4
-// CHECK: br label %entry
-
-// CHECK: entry:
-// CHECK: br label %omp.private.init
-
-// CHECK: omp.private.init: ; preds = %entry
-// CHECK: %[[OMP_TASK_CONTEXT_PTR:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ i32 }, ptr null, i32 1) to i64))
-// CHECK: %[[PRIV_GEP:.*]] = getelementptr { i32 }, ptr %[[OMP_TASK_CONTEXT_PTR]], i32 0, i32 0
-// CHECK: br label %omp.private.copy
-
-// CHECK: omp.private.copy:
-// CHECK: br label %omp.private.copy1
-
-// CHECK: omp.private.copy1:
-// CHECK: %[[LOAD_X:.*]] = load i32, ptr %[[VAL_X]], align 4
-// CHECK: store i32 %[[LOAD_X]], ptr %[[PRIV_GEP]], align 4
-// CHECK: br label %omp.taskloop.start
-
-// CHECK: omp.taskloop.start:
-// CHECK: br label %codeRepl
-
-// CHECK: codeRepl:
-// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0
-// CHECK: store ptr %[[OMP_TASK_CONTEXT_PTR]], ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8
-// CHECK: %[[GTID:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
-// CHECK: call void @__kmpc_taskgroup(ptr @1, i32 %[[GTID]])
-// CHECK: %[[TASK_PTR:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[GTID]], i32 1, i64 64, i64 8, ptr @_QPtest..omp_par)
-// CHECK: %[[LB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 5
-// CHECK: store i64 1, ptr %[[LB_GEP]], align 4
-// CHECK: %[[UB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 6
-// CHECK: store i64 5, ptr %[[UB_GEP]], align 4
-// CHECK: %[[STEP_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 7
-// CHECK: store i64 1, ptr %[[STEP_GEP]], align 4
-// CHECK: %[[LOAD_STEP:.*]] = load i64, ptr %[[STEP_GEP]], align 4
-// CHECK: %10 = load ptr, ptr %[[TASK_PTR]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %10, ptr align 1 %[[STRUCTARG]], i64 8, i1 false)
-// CHECK: call void @__kmpc_taskloop(ptr @1, i32 %[[GTID]], ptr %[[TASK_PTR]], i32 1, ptr %[[LB_GEP]], ptr %[[UB_GEP]], i64 %[[LOAD_STEP]], i32 1, i32 0, i64 0, ptr null)
-// CHECK: call void @__kmpc_end_taskgroup(ptr @1, i32 %[[GTID]])
-// CHECK: br label %taskloop.exit
-
-// CHECK: taskloop.exit:
-// CHECK: tail call void @free(ptr %[[OMP_TASK_CONTEXT_PTR]])
-// CHECK: ret void
-// CHECK: }
-
-// CHECK-LABEL: define internal void @_QPtest..omp_par
-// CHECK-SAME: i32 %[[GLOBAL_TID:.*]], ptr %[[TASK_PTR1:.*]]) {
-// CHECK: taskloop.alloca:
-// CHECK: %[[LOAD_TASK_PTR:.*]] = load ptr, ptr %[[TASK_PTR1]], align 8
-// CHECK: %[[GEP_LB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 5
-// CHECK: %[[LOAD_LB64:.*]] = load i64, ptr %[[GEP_LB]], align 4
-// CHECK: %[[LB:.*]] = trunc i64 %[[LOAD_LB64]] to i32
-// CHECK: %[[GEP_UB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 6
-// CHECK: %[[LOAD_UB64:.*]] = load i64, ptr %[[GEP_UB]], align 4
-// CHECK: %[[UB:.*]] = trunc i64 %[[LOAD_UB64]] to i32
-// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[LOAD_TASK_PTR]], i32 0, i32 0
-// CHECK: %[[LOADGEP_OMP_TASK_CONTEXT_PTR:.*]] = load ptr, ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8, !align !1
-// CHECK: %[[OMP_PRIVATE_ALLOC:.*]] = alloca i32, align 4
-// CHECK: br label %taskloop.body
-
-// CHECK: taskloop.body:
-// CHECK: %[[LOAD_X:.*]] = getelementptr { i32 }, ptr %[[LOADGEP_OMP_TASK_CONTEXT_PTR]], i32 0, i32 0
-// CHECK: br label %omp.taskloop.region
-
-// CHECK: omp.taskloop.region:
-// CHECK: br label %omp_loop.preheader
-
-// CHECK: omp_loop.preheader:
-// CHECK: %[[VAL2:.*]] = sub i32 %[[UB]], %[[LB]]
-// CHECK: %[[TRIP_CNT:.*]] = add i32 %[[VAL2]], 1
-// CHECK: br label %omp_loop.header
-
-// CHECK: omp_loop.header:
-// CHECK: %[[OMP_LOOP_IV:.*]] = phi i32 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
-// CHECK: br label %omp_loop.cond
-
-// CHECK: omp_loop.cond:
-// CHECK: %[[OMP_LOOP_CMP:.*]] = icmp ult i32 %[[OMP_LOOP_IV]], %[[TRIP_CNT]]
-// CHECK: br i1 %[[OMP_LOOP_CMP]], label %omp_loop.body, label %omp_loop.exit
-
-// CHECK: omp_loop.exit:
-// CHECK: br label %omp_loop.after
-
-// CHECK: omp_loop.after:
-// CHECK: br label %omp.region.cont
-
-// CHECK: omp.region.cont:
-// CHECK: %[[IS_ALLOCATED:.*]] = icmp ne ptr %[[LOADGEP_OMP_TASK_CONTEXT_PTR]], null
-// CHECK: br label %taskloop.exit.exitStub
-
-// CHECK: omp_loop.body:
-// CHECK: %[[VAL3:.*]] = mul i32 %[[OMP_LOOP_IV]], 1
-// CHECK: %[[VAL5:.*]] = add i32 %[[VAL3]], %[[LB]]
-// CHECK: br label %omp.loop_nest.region
-
-// CHECK: omp.loop_nest.region:
-// CHECK: store i32 %[[VAL5]], ptr %[[OMP_PRIVATE_ALLOC]], align 4
-// CHECK: %[[VAL6:.*]] = load i32, ptr %[[LOAD_X]], align 4
-// CHECK: %[[RES:.*]] = add i32 %[[VAL6]], 1
-// CHECK: store i32 %[[RES]], ptr %[[LOAD_X]], align 4
-// CHECK: br label %omp.region.cont2
-
-// CHECK: omp.region.cont2:
-// CHECK: br label %omp_loop.inc
-
-// CHECK: omp_loop.inc:
-// CHECK: %omp_loop.next = add nuw i32 %[[OMP_LOOP_IV]], 1
-// CHECK: br label %omp_loop.header
-
-// CHECK: taskloop.exit.exitStub:
-// CHECK: ret void
-// CHECK: }
\ No newline at end of file
>From 6841450d63b40c57c2e84796bd9f42a2cee495fd Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Mon, 5 Jan 2026 09:55:40 +0000
Subject: [PATCH 14/14] [NFC] Share body generation callback between task and
taskloop
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 273 +++++++-----------
1 file changed, 101 insertions(+), 172 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 17021110069a5..1bb06f8f9cfe7 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2350,11 +2350,105 @@ void TaskContextStructManager::freeStructPtr() {
builder.CreateFree(structPtr);
}
+using TaskLikeBodyGenCallbackTy =
+    std::function<llvm::Error(llvm::OpenMPIRBuilder::InsertPointTy allocaIP,
+                              llvm::OpenMPIRBuilder::InsertPointTy codegenIP)>;
+
+/// Build the body generation callback shared by task-like constructs (task and
+/// taskloop).
+///
+/// The returned callback translates \p region (labelled \p regionName), then
+/// emits cleanup for the private variables and frees the task context
+/// structure at the end of the translated region.
+///
+/// \p region, \p builder, \p moduleTranslation, \p privateVarsInfo and
+/// \p taskStructMgr are captured by reference and must outlive every
+/// invocation of the returned callback. \p opInst and \p regionName are
+/// by-value parameters of this function, so they are captured by value:
+/// capturing them by reference would leave the callback holding references to
+/// parameters that no longer exist once this function has returned.
+static TaskLikeBodyGenCallbackTy buildTaskLikeBodyGenCallback(
+    Operation *opInst, Region &region, StringRef regionName,
+    llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation,
+    PrivateVarsInfo &privateVarsInfo, TaskContextStructManager &taskStructMgr) {
+  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+  return [&, opInst, regionName](InsertPointTy allocaIP,
+                                 InsertPointTy codegenIP) -> llvm::Error {
+    // Save the alloca insertion point on ModuleTranslation stack for use in
+    // nested regions.
+    LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
+        moduleTranslation, allocaIP);
+
+    // translate the body of the task:
+    builder.restoreIP(codegenIP);
+
+    llvm::BasicBlock *privInitBlock = nullptr;
+    privateVarsInfo.llvmVars.resize(privateVarsInfo.blockArgs.size());
+    for (auto [i, zip] : llvm::enumerate(llvm::zip_equal(
+             privateVarsInfo.blockArgs, privateVarsInfo.privatizers,
+             privateVarsInfo.mlirVars))) {
+      auto [blockArg, privDecl, mlirPrivVar] = zip;
+      // This is handled before the task executes
+      if (privDecl.readsFromMold())
+        continue;
+
+      // The guard restores the builder's insertion point at the end of each
+      // iteration, after it has been moved to the alloca block below.
+      llvm::IRBuilderBase::InsertPointGuard guard(builder);
+      llvm::Type *llvmAllocType =
+          moduleTranslation.convertType(privDecl.getType());
+      builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
+      llvm::Value *llvmPrivateVar = builder.CreateAlloca(
+          llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
+
+      llvm::Expected<llvm::Value *> privateVarOrError =
+          initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar,
+                         blockArg, llvmPrivateVar, privInitBlock);
+      if (!privateVarOrError)
+        return privateVarOrError.takeError();
+      moduleTranslation.mapValue(blockArg, privateVarOrError.get());
+      privateVarsInfo.llvmVars[i] = privateVarOrError.get();
+    }
+
+    // Entries are written by index so that llvmVars stays parallel to
+    // privatizers; the cleanup below pairs them up positionally.
+    taskStructMgr.createGEPsToPrivateVars();
+    for (auto [i, llvmPrivVar] :
+         llvm::enumerate(taskStructMgr.getLLVMPrivateVarGEPs())) {
+      if (!llvmPrivVar) {
+        assert(privateVarsInfo.llvmVars[i] &&
+               "This is added in the loop above");
+        continue;
+      }
+      privateVarsInfo.llvmVars[i] = llvmPrivVar;
+    }
+
+    // Find and map the addresses of each variable within the task context
+    // structure
+    for (auto [blockArg, llvmPrivateVar, privateDecl] :
+         llvm::zip_equal(privateVarsInfo.blockArgs, privateVarsInfo.llvmVars,
+                         privateVarsInfo.privatizers)) {
+      // This was handled above.
+      if (!privateDecl.readsFromMold())
+        continue;
+      // Fix broken pass-by-value case for Fortran character boxes
+      if (!mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
+        llvmPrivateVar = builder.CreateLoad(
+            moduleTranslation.convertType(blockArg.getType()), llvmPrivateVar);
+      }
+      assert(llvmPrivateVar->getType() ==
+             moduleTranslation.convertType(blockArg.getType()));
+      moduleTranslation.mapValue(blockArg, llvmPrivateVar);
+    }
+
+    auto continuationBlockOrError =
+        convertOmpOpRegions(region, regionName, builder, moduleTranslation);
+    if (failed(handleError(continuationBlockOrError, *opInst)))
+      return llvm::make_error<PreviouslyReportedError>();
+
+    // Emit the cleanup before the continuation block's terminator.
+    builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
+
+    if (failed(cleanupPrivateVars(builder, moduleTranslation, opInst->getLoc(),
+                                  privateVarsInfo.llvmVars,
+                                  privateVarsInfo.privatizers)))
+      return llvm::make_error<PreviouslyReportedError>();
+
+    // Free heap allocated task context structure at the end of the task.
+    taskStructMgr.freeStructPtr();
+
+    return llvm::Error::success();
+  };
+}
+
/// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
- using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
if (failed(checkImplementationStatus(*taskOp)))
return failure();
@@ -2467,88 +2561,9 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
// Set up for call to createTask()
builder.SetInsertPoint(taskStartBlock);
- auto bodyCB = [&](InsertPointTy allocaIP,
- InsertPointTy codegenIP) -> llvm::Error {
- // Save the alloca insertion point on ModuleTranslation stack for use in
- // nested regions.
- LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
- moduleTranslation, allocaIP);
-
- // translate the body of the task:
- builder.restoreIP(codegenIP);
-
- llvm::BasicBlock *privInitBlock = nullptr;
- privateVarsInfo.llvmVars.resize(privateVarsInfo.blockArgs.size());
- for (auto [i, zip] : llvm::enumerate(llvm::zip_equal(
- privateVarsInfo.blockArgs, privateVarsInfo.privatizers,
- privateVarsInfo.mlirVars))) {
- auto [blockArg, privDecl, mlirPrivVar] = zip;
- // This is handled before the task executes
- if (privDecl.readsFromMold())
- continue;
-
- llvm::IRBuilderBase::InsertPointGuard guard(builder);
- llvm::Type *llvmAllocType =
- moduleTranslation.convertType(privDecl.getType());
- builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
- llvm::Value *llvmPrivateVar = builder.CreateAlloca(
- llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
-
- llvm::Expected<llvm::Value *> privateVarOrError =
- initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar,
- blockArg, llvmPrivateVar, privInitBlock);
- if (!privateVarOrError)
- return privateVarOrError.takeError();
- moduleTranslation.mapValue(blockArg, privateVarOrError.get());
- privateVarsInfo.llvmVars[i] = privateVarOrError.get();
- }
-
- taskStructMgr.createGEPsToPrivateVars();
- for (auto [i, llvmPrivVar] :
- llvm::enumerate(taskStructMgr.getLLVMPrivateVarGEPs())) {
- if (!llvmPrivVar) {
- assert(privateVarsInfo.llvmVars[i] &&
- "This is added in the loop above");
- continue;
- }
- privateVarsInfo.llvmVars[i] = llvmPrivVar;
- }
-
- // Find and map the addresses of each variable within the task context
- // structure
- for (auto [blockArg, llvmPrivateVar, privateDecl] :
- llvm::zip_equal(privateVarsInfo.blockArgs, privateVarsInfo.llvmVars,
- privateVarsInfo.privatizers)) {
- // This was handled above.
- if (!privateDecl.readsFromMold())
- continue;
- // Fix broken pass-by-value case for Fortran character boxes
- if (!mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
- llvmPrivateVar = builder.CreateLoad(
- moduleTranslation.convertType(blockArg.getType()), llvmPrivateVar);
- }
- assert(llvmPrivateVar->getType() ==
- moduleTranslation.convertType(blockArg.getType()));
- moduleTranslation.mapValue(blockArg, llvmPrivateVar);
- }
-
- auto continuationBlockOrError = convertOmpOpRegions(
- taskOp.getRegion(), "omp.task.region", builder, moduleTranslation);
- if (failed(handleError(continuationBlockOrError, *taskOp)))
- return llvm::make_error<PreviouslyReportedError>();
-
- builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
-
- if (failed(cleanupPrivateVars(builder, moduleTranslation, taskOp.getLoc(),
- privateVarsInfo.llvmVars,
- privateVarsInfo.privatizers)))
- return llvm::make_error<PreviouslyReportedError>();
-
- // Free heap allocated task context structure at the end of the task.
- taskStructMgr.freeStructPtr();
-
- return llvm::Error::success();
- };
+ auto bodyCB = buildTaskLikeBodyGenCallback(
+ taskOp, taskOp.getRegion(), "omp.task.region", builder, moduleTranslation,
+ privateVarsInfo, taskStructMgr);
llvm::OpenMPIRBuilder &ompBuilder = *moduleTranslation.getOpenMPBuilder();
SmallVector<llvm::BranchInst *> cancelTerminators;
@@ -2669,95 +2684,9 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
// Set up inserttion point for call to createTaskloop()
builder.SetInsertPoint(taskloopStartBlock);
- auto bodyCB = [&](InsertPointTy allocaIP,
- InsertPointTy codegenIP) -> llvm::Error {
- // Save the alloca insertion point on ModuleTranslation stack for use in
- // nested regions.
- LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
- moduleTranslation, allocaIP);
-
- // translate the body of the taskloop:
- builder.restoreIP(codegenIP);
-
- llvm::BasicBlock *privInitBlock = nullptr;
- privateVarsInfo.llvmVars.resize(privateVarsInfo.blockArgs.size());
- for (auto [i, zip] : llvm::enumerate(llvm::zip_equal(
- privateVarsInfo.blockArgs, privateVarsInfo.privatizers,
- privateVarsInfo.mlirVars))) {
- auto [blockArg, privDecl, mlirPrivVar] = zip;
- // This is handled before the task executes
- if (privDecl.readsFromMold())
- continue;
-
- llvm::IRBuilderBase::InsertPointGuard guard(builder);
- llvm::Type *llvmAllocType =
- moduleTranslation.convertType(privDecl.getType());
- builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
- llvm::Value *llvmPrivateVar = builder.CreateAlloca(
- llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
-
- llvm::Expected<llvm::Value *> privateVarOrError =
- initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar,
- blockArg, llvmPrivateVar, privInitBlock);
- if (!privateVarOrError)
- return privateVarOrError.takeError();
- moduleTranslation.mapValue(blockArg, privateVarOrError.get());
- privateVarsInfo.llvmVars[i] = privateVarOrError.get();
- }
-
- taskStructMgr.createGEPsToPrivateVars();
- for (auto [i, llvmPrivVar] :
- llvm::enumerate(taskStructMgr.getLLVMPrivateVarGEPs())) {
- if (!llvmPrivVar) {
- assert(privateVarsInfo.llvmVars[i] &&
- "This is added in the loop above");
- continue;
- }
- privateVarsInfo.llvmVars[i] = llvmPrivVar;
- }
-
- // Find and map the addresses of each variable within the taskloop context
- // structure
- for (auto [blockArg, llvmPrivateVar, privateDecl] :
- llvm::zip_equal(privateVarsInfo.blockArgs, privateVarsInfo.llvmVars,
- privateVarsInfo.privatizers)) {
- // This was handled above.
- if (!privateDecl.readsFromMold())
- continue;
- // Fix broken pass-by-value case for Fortran character boxes
- if (!mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
- llvmPrivateVar = builder.CreateLoad(
- moduleTranslation.convertType(blockArg.getType()), llvmPrivateVar);
- }
- assert(llvmPrivateVar->getType() ==
- moduleTranslation.convertType(blockArg.getType()));
- moduleTranslation.mapValue(blockArg, llvmPrivateVar);
- }
-
- auto continuationBlockOrError =
- convertOmpOpRegions(taskloopOp.getRegion(), "omp.taskloop.region",
- builder, moduleTranslation);
-
- if (failed(handleError(continuationBlockOrError, opInst)))
- return llvm::make_error<PreviouslyReportedError>();
-
- builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
-
- // This is freeing the private variables as mapped inside of the task: these
- // will be per-task private copies possibly after task duplication. This is
- // handled transparently by how these are passed to the structure passed
- // into the outlined function. When the task is duplicated, that structure
- // is duplicated too.
- if (failed(cleanupPrivateVars(builder, moduleTranslation,
- taskloopOp.getLoc(), privateVarsInfo.llvmVars,
- privateVarsInfo.privatizers)))
- return llvm::make_error<PreviouslyReportedError>();
- // Similarly, the task context structure freed inside the task is the
- // per-task copy after task duplication.
- taskStructMgr.freeStructPtr();
-
- return llvm::Error::success();
- };
+ auto bodyCB = buildTaskLikeBodyGenCallback(
+ &opInst, taskloopOp.getRegion(), "omp.taskloop.region", builder,
+ moduleTranslation, privateVarsInfo, taskStructMgr);
// Taskloop divides into an appropriate number of tasks by repeatedly
// duplicating the original task. Each time this is done, the task context
More information about the Mlir-commits
mailing list