[Mlir-commits] [llvm] [mlir] [mlir][OpenMP] Translation support for taskloop construct (PR #174386)
Tom Eccles
llvmlistbot at llvm.org
Wed Jan 7 08:12:15 PST 2026
https://github.com/tblah updated https://github.com/llvm/llvm-project/pull/174386
>From 292f43fe5b4c294da7d6dc47c24edde3bfa5296f Mon Sep 17 00:00:00 2001
From: Kaviya Rajendiran <kaviyara2000 at gmail.com>
Date: Fri, 7 Nov 2025 12:49:53 +0530
Subject: [PATCH 01/16] [Flang][OpenMP] Translation support for taskloop
construct
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 16 ++
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 1 +
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 199 ++++++++++++++
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 257 +++++++++++++++++-
mlir/test/Target/LLVMIR/openmp-taskloop.mlir | 151 ++++++++++
mlir/test/Target/LLVMIR/openmp-todo.mlir | 15 +-
6 files changed, 622 insertions(+), 17 deletions(-)
create mode 100644 mlir/test/Target/LLVMIR/openmp-taskloop.mlir
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index d6fc49afb6fdb..88b698bc71874 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1404,6 +1404,22 @@ class OpenMPIRBuilder {
: DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {}
};
+ /// Generator for `#omp taskloop`. Returns the insertion point at the start of the taskloop exit block, or the error reported by BodyGenCB or LoopInfo.
+ ///
+ /// \param Loc The location where the taskloop construct was encountered.
+ /// \param AllocaIP The insertion point to be used for alloca instructions.
+ /// \param BodyGenCB Callback that will generate the region code.
+ /// \param LoopInfo Callback that returns the CanonicalLoopInfo (CLI) of the loop, or an error.
+ /// \param LBVal Lower bound value of the loop.
+ /// \param UBVal Upper bound value of the loop.
+ /// \param StepVal Step value of the loop.
+ /// \param Tied True if the task is tied, false if the task is untied.
+ LLVM_ABI InsertPointOrErrorTy createTaskloop(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ BodyGenCallbackTy BodyGenCB,
+ llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
+ Value *LBVal, Value *UBVal, Value *StepVal, bool Tied = true);
+
/// Generator for `#omp task`
///
/// \param Loc The location where the task construct was encountered.
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 152a8f727310a..bb12c1558766b 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -95,6 +95,7 @@ __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidP
__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr)
__OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8)
__OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr)
+__OMP_STRUCT_TYPE(Taskloop, kmp_task_info, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr, Int64, Int64, Int64)
__OMP_STRUCT_TYPE(ConfigurationEnvironment, ConfigurationEnvironmentTy, false,
Int8, Int8, Int8, Int32, Int32, Int32, Int32, Int32, Int32)
__OMP_STRUCT_TYPE(DynamicEnvironment, DynamicEnvironmentTy, false, Int16)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index a3e7c5ea8059b..2d7c50ece7199 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2025,6 +2025,205 @@ static Value *emitTaskDependencies(
return DepArray;
}
+OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ BodyGenCallbackTy BodyGenCB,
+ llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> loopInfo,
+ Value *LBVal, Value *UBVal, Value *StepVal, bool Tied) {
+ // Splits out the taskloop alloca/body/exit blocks, generates the body, and registers an outline whose PostOutlineCB replaces the stale call with the __kmpc_taskgroup, __kmpc_omp_task_alloc, __kmpc_taskloop and __kmpc_end_taskgroup runtime calls.
+ if (!updateToLocation(Loc))
+ return InsertPointTy();
+
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+
+ BasicBlock *TaskloopExitBB =
+ splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
+ BasicBlock *TaskloopBodyBB =
+ splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
+ BasicBlock *TaskloopAllocaBB =
+ splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
+
+ InsertPointTy TaskloopAllocaIP =
+ InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
+ InsertPointTy TaskloopBodyIP =
+ InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
+
+ if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
+ return Err;
+
+ llvm::Expected<llvm::CanonicalLoopInfo *> result = loopInfo();
+ if (!result) {
+ return result.takeError();
+ }
+
+ llvm::CanonicalLoopInfo *CLI = result.get();
+ OutlineInfo OI;
+ OI.EntryBB = TaskloopAllocaBB;
+ OI.OuterAllocaBB = AllocaIP.getBlock();
+ OI.ExitBB = TaskloopExitBB;
+
+ // Add the thread ID argument.
+ SmallVector<Instruction *, 4> ToBeDeleted;
+ // Dummy instruction used as a fake "global.tid" argument; everything recorded in ToBeDeleted is erased at the end of PostOutlineCB.
+ OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
+ Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
+
+ OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Tied,
+ TaskloopAllocaBB, CLI, Loc,
+ ToBeDeleted](Function &OutlinedFn) mutable {
+ // Replace the stale call to the outlined function (StaleCI) with the appropriate runtime calls.
+ assert(OutlinedFn.hasOneUse() &&
+ "there must be a single user for the outlined function");
+ CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+
+ // HasShareds is true if any variables are captured in the outlined region,
+ // false otherwise.
+ bool HasShareds = StaleCI->arg_size() > 1;
+ Builder.SetInsertPoint(StaleCI);
+
+ // Gather the arguments for emitting the runtime call for
+ // @__kmpc_omp_task_alloc
+ Function *TaskAllocFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
+
+ Value *ThreadID = getOrCreateThreadID(Ident);
+
+ // Emit runtime call for @__kmpc_taskgroup
+ Function *TaskgroupFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
+ Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
+
+ // The flags are set to 1 if the task is tied, 0 otherwise.
+ Value *Flags = Builder.getInt32(Tied);
+
+ Value *TaskSize = Builder.getInt64(
+ divideCeil(M.getDataLayout().getTypeSizeInBits(Taskloop), 8));
+
+ Value *SharedsSize = Builder.getInt64(0);
+ if (HasShareds) {
+ AllocaInst *ArgStructAlloca =
+ dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+ assert(ArgStructAlloca &&
+ "Unable to find the alloca instruction corresponding to arguments "
+ "for extracted function");
+ StructType *ArgStructType =
+ dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
+ assert(ArgStructType && "Unable to find struct type corresponding to "
+ "arguments for extracted function");
+ SharedsSize =
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
+ }
+
+ // Emit the @__kmpc_omp_task_alloc runtime call
+ // The runtime call returns a pointer to an area where the task captured
+ // variables must be copied before the task is run (TaskData)
+ CallInst *TaskData = Builder.CreateCall(
+ TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
+ /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
+ /*task_func=*/&OutlinedFn});
+
+ // Get the pointer to loop lb, ub, step from task ptr
+ // and set up the lowerbound,upperbound and step values
+ llvm::Value *lb =
+ Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 5);
+ // Value *LbVal_ext = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
+ Builder.CreateStore(LBVal, lb);
+
+ llvm::Value *ub =
+ Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 6);
+ Builder.CreateStore(UBVal, ub);
+
+ llvm::Value *step =
+ Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 7);
+ Value *Step_ext = Builder.CreateSExt(StepVal, Builder.getInt64Ty());
+ Builder.CreateStore(Step_ext, step);
+ llvm::Value *loadstep = Builder.CreateLoad(Builder.getInt64Ty(), step);
+
+ if (HasShareds) {
+ Value *Shareds = StaleCI->getArgOperand(1);
+ Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
+ Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
+ Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
+ SharedsSize);
+ }
+
+ // Set up the arguments for the __kmpc_taskloop runtime call. ifval, nogroup,
+ // sched, grainsize and task_dup are hard-coded to default values here.
+ Value *IfVal = Builder.getInt32(1);
+ Value *NoGroup = Builder.getInt32(1);
+ Value *Sched = Builder.getInt32(0);
+ Value *GrainSize = Builder.getInt64(0);
+ Value *TaskDup = Constant::getNullValue(Builder.getPtrTy());
+
+ Value *Args[] = {Ident, ThreadID, TaskData, IfVal, lb, ub,
+ loadstep, NoGroup, Sched, GrainSize, TaskDup};
+
+ // Emit the @__kmpc_taskloop runtime call.
+ Function *TaskloopFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
+ Builder.CreateCall(TaskloopFn, Args);
+
+ // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
+ Function *EndTaskgroupFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
+ Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
+
+ StaleCI->eraseFromParent();
+
+ Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
+
+ if (HasShareds) {
+ LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
+ OutlinedFn.getArg(1)->replaceUsesWithIf(
+ Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
+ }
+
+ Value *IV = CLI->getIndVar();
+ Type *IVTy = IV->getType();
+ Constant *One = ConstantInt::get(IVTy, 1);
+
+ Value *task_lb = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
+ OutlinedFn.getArg(1), 5, "gep_lb");
+ Value *LowerBound = Builder.CreateLoad(IVTy, task_lb, "lb");
+
+ Value *task_ub = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
+ OutlinedFn.getArg(1), 6, "gep_ub");
+ Value *UpperBound = Builder.CreateLoad(IVTy, task_ub, "ub");
+
+ Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
+
+ Value *TripCountMinusOne = Builder.CreateSub(UpperBound, LowerBound);
+ Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
+ // Set the CLI trip count to (ub - lb) + 1, computed from the bounds loaded from the task structure.
+ CLI->setTripCount(TripCount);
+
+ Builder.SetInsertPoint(CLI->getBody(),
+ CLI->getBody()->getFirstInsertionPt());
+
+ llvm::BasicBlock *Body = CLI->getBody();
+ for (llvm::Instruction &I : *Body) {
+ if (auto *Add = llvm::dyn_cast<llvm::BinaryOperator>(&I)) {
+ if (Add->getOpcode() == llvm::Instruction::Add) {
+ if (llvm::isa<llvm::BinaryOperator>(Add->getOperand(0))) {
+ // Use the lower bound loaded from the task structure as operand 1 of this add, i.e. as the loop's starting index.
+ Add->setOperand(1, LowerBound);
+ }
+ }
+ }
+ }
+
+ for (Instruction *I : llvm::reverse(ToBeDeleted)) {
+ I->eraseFromParent();
+ }
+ };
+
+ addOutlineInfo(std::move(OI));
+ Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
+ return Builder.saveIP();
+}
+
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
const LocationDescription &Loc, InsertPointTy AllocaIP,
BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 55df986ad3d11..dac63dccb7a3c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -353,6 +353,26 @@ static LogicalResult checkImplementationStatus(Operation &op) {
if (op.getDevice())
result = todo("device");
};
+ auto checkFinal = [&todo](auto op, LogicalResult &result) { // `final` clause: reported through todo().
+ if (op.getFinal())
+ result = todo("final");
+ };
+ auto checkGrainsize = [&todo](auto op, LogicalResult &result) { // `grainsize` clause: reported through todo().
+ if (op.getGrainsize())
+ result = todo("grainsize");
+ };
+ auto checkIf = [](auto op, LogicalResult &) { // `if` clause: only emits a warning; the result is left untouched.
+ if (op.getIfExpr())
+ op.emitWarning("if"); // NOTE(review): the message is just "if", unlike checkHint's "hint clause discarded" — confirm the intended wording.
+ };
+ auto checkMergeable = [&todo](auto op, LogicalResult &result) { // `mergeable` clause: reported through todo().
+ if (op.getMergeable())
+ result = todo("mergeable");
+ };
+ auto checkNogroup = [&todo](auto op, LogicalResult &result) { // `nogroup` clause: reported through todo().
+ if (op.getNogroup())
+ result = todo("nogroup");
+ };
auto checkHint = [](auto op, LogicalResult &) {
if (op.getHint())
op.emitWarning("hint clause discarded");
@@ -366,6 +386,10 @@ static LogicalResult checkImplementationStatus(Operation &op) {
if (op.getNowait())
result = todo("nowait");
};
+ auto checkNumTasks = [&todo](auto op, LogicalResult &result) { // `num_tasks` clause: reported through todo().
+ if (op.getNumTasks())
+ result = todo("num_tasks");
+ };
auto checkOrder = [&todo](auto op, LogicalResult &result) {
if (op.getOrder() || op.getOrderMod())
result = todo("order");
@@ -438,7 +462,15 @@ static LogicalResult checkImplementationStatus(Operation &op) {
checkNowait(op, result);
})
.Case([&](omp::TaskloopOp op) {
- // TODO: Add other clauses check
+ checkAllocate(op, result);
+ checkFinal(op, result);
+ checkGrainsize(op, result);
+ checkIf(op, result);
+ checkInReduction(op, result);
+ checkMergeable(op, result);
+ checkNogroup(op, result);
+ checkNumTasks(op, result);
+ checkReduction(op, result);
checkUntied(op, result);
checkPriority(op, result);
})
@@ -2201,6 +2233,8 @@ class TaskContextStructManager {
/// private decls.
void createGEPsToPrivateVars();
+ llvm::Value *isAllocated();
+
/// De-allocate the task context structure.
void freeStructPtr();
@@ -2281,13 +2315,26 @@ void TaskContextStructManager::createGEPsToPrivateVars() {
}
}
+llvm::Value *TaskContextStructManager::isAllocated() {
+ if (!structPtr)
+ return nullptr; // structPtr is unset: there is nothing to test.
+ // Emit an "is not null" comparison on structPtr at the builder's current insertion point and return it.
+ return builder.CreateIsNotNull(structPtr);
+}
+
void TaskContextStructManager::freeStructPtr() {
if (!structPtr)
return;
llvm::IRBuilderBase::InsertPointGuard guard{builder};
- // Ensure we don't put the call to free() after the terminator
- builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
+ llvm::BasicBlock *currentBlock = builder.GetInsertBlock();
+ if (currentBlock->getTerminator()) {
+ // Ensure we don't put the call to free() after the terminator
+ builder.SetInsertPoint(currentBlock->getTerminator());
+ } else {
+ // Insert the call to free() at the end of the current block
+ builder.SetInsertPoint(currentBlock);
+ }
builder.CreateFree(structPtr);
}
@@ -2523,6 +2570,207 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
return success();
}
+/// Converts an OpenMP taskloop construct into LLVM IR using OpenMPIRBuilder. Privatizers that read from the mold are initialized and copied into the task context structure before createTaskloop() is called; the remaining ones are allocated inside the body callback.
+static LogicalResult
+convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) {
+ using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+ auto taskloopOp = cast<omp::TaskloopOp>(opInst);
+ if (failed(checkImplementationStatus(opInst)))
+ return failure();
+
+ // Holds the pointers to the private/firstprivate copies so that they can be
+ // passed to cleanupPrivateVars() once the taskloop has been emitted.
+ SmallVector<llvm::Value *> llvmFirstPrivateVars;
+ PrivateVarsInfo privateVarsInfo(taskloopOp);
+ TaskContextStructManager taskStructMgr{builder, moduleTranslation,
+ privateVarsInfo.privatizers};
+
+ llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
+ findAllocaInsertPoint(builder, moduleTranslation);
+
+ assert(builder.GetInsertPoint() == builder.GetInsertBlock()->end());
+ llvm::BasicBlock *taskloopStartBlock = llvm::BasicBlock::Create(
+ builder.getContext(), "omp.taskloop.start",
+ /*Parent=*/builder.GetInsertBlock()->getParent());
+ llvm::Instruction *branchToTaskloopStartBlock =
+ builder.CreateBr(taskloopStartBlock);
+ builder.SetInsertPoint(branchToTaskloopStartBlock);
+
+ llvm::BasicBlock *copyBlock =
+ splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
+ llvm::BasicBlock *initBlock =
+ splitBB(builder, /*CreateBranch=*/true, "omp.private.init");
+
+ LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
+ moduleTranslation, allocaIP);
+
+ // Allocate and initialize private variables
+ builder.SetInsertPoint(initBlock->getTerminator());
+
+ taskStructMgr.generateTaskContextStruct();
+ taskStructMgr.createGEPsToPrivateVars();
+
+ llvmFirstPrivateVars.resize(privateVarsInfo.blockArgs.size());
+ int index = 0; // Next free slot in llvmFirstPrivateVars; also advanced inside bodyCB below.
+
+ for (auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVarAlloc] :
+ llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.mlirVars,
+ privateVarsInfo.blockArgs,
+ taskStructMgr.getLLVMPrivateVarGEPs())) {
+ // To be handled inside the taskloop.
+ if (!privDecl.readsFromMold())
+ continue;
+ assert(llvmPrivateVarAlloc &&
+ "reads from mold so shouldn't have been skipped");
+
+ llvm::Expected<llvm::Value *> privateVarOrErr =
+ initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar,
+ blockArg, llvmPrivateVarAlloc, initBlock);
+ if (!privateVarOrErr)
+ return handleError(privateVarOrErr, *taskloopOp.getOperation());
+
+ llvmFirstPrivateVars[index++] = privateVarOrErr.get();
+
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
+
+ if ((privateVarOrErr.get() != llvmPrivateVarAlloc) &&
+ !mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
+ builder.CreateStore(privateVarOrErr.get(), llvmPrivateVarAlloc);
+ // Load it so we have the value pointed to by the GEP
+ llvmPrivateVarAlloc = builder.CreateLoad(privateVarOrErr.get()->getType(),
+ llvmPrivateVarAlloc);
+ }
+ assert(llvmPrivateVarAlloc->getType() ==
+ moduleTranslation.convertType(blockArg.getType()));
+ }
+
+ // firstprivate copy region
+ setInsertPointForPossiblyEmptyBlock(builder, copyBlock);
+ if (failed(copyFirstPrivateVars(
+ taskloopOp, builder, moduleTranslation, privateVarsInfo.mlirVars,
+ taskStructMgr.getLLVMPrivateVarGEPs(), privateVarsInfo.privatizers,
+ taskloopOp.getPrivateNeedsBarrier())))
+ return llvm::failure();
+
+ // Set up insertion point for call to createTaskloop()
+ builder.SetInsertPoint(taskloopStartBlock);
+
+ auto bodyCB = [&](InsertPointTy allocaIP,
+ InsertPointTy codegenIP) -> llvm::Error {
+ // Save the alloca insertion point on ModuleTranslation stack for use in
+ // nested regions.
+ LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
+ moduleTranslation, allocaIP);
+
+ // Translate the body of the taskloop.
+ builder.restoreIP(codegenIP);
+
+ llvm::BasicBlock *privInitBlock = nullptr;
+ privateVarsInfo.llvmVars.resize(privateVarsInfo.blockArgs.size());
+ for (auto [i, zip] : llvm::enumerate(llvm::zip_equal(
+ privateVarsInfo.blockArgs, privateVarsInfo.privatizers,
+ privateVarsInfo.mlirVars))) {
+ auto [blockArg, privDecl, mlirPrivVar] = zip;
+ // This is handled before the task executes
+ if (privDecl.readsFromMold())
+ continue;
+
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ llvm::Type *llvmAllocType =
+ moduleTranslation.convertType(privDecl.getType());
+ builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
+ llvm::Value *llvmPrivateVar = builder.CreateAlloca(
+ llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
+
+ llvm::Expected<llvm::Value *> privateVarOrError =
+ initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar,
+ blockArg, llvmPrivateVar, privInitBlock);
+ if (!privateVarOrError)
+ return privateVarOrError.takeError();
+ moduleTranslation.mapValue(blockArg, privateVarOrError.get());
+ privateVarsInfo.llvmVars[i] = privateVarOrError.get();
+ // Add private var to llvmFirstPrivateVars
+ llvmFirstPrivateVars[index++] = privateVarOrError.get();
+ }
+
+ taskStructMgr.createGEPsToPrivateVars();
+ for (auto [i, llvmPrivVar] :
+ llvm::enumerate(taskStructMgr.getLLVMPrivateVarGEPs())) {
+ if (!llvmPrivVar) {
+ assert(privateVarsInfo.llvmVars[i] &&
+ "This is added in the loop above");
+ continue;
+ }
+ privateVarsInfo.llvmVars[i] = llvmPrivVar;
+ }
+
+ // Find and map the addresses of each variable within the taskloop context
+ // structure
+ for (auto [blockArg, llvmPrivateVar, privateDecl] :
+ llvm::zip_equal(privateVarsInfo.blockArgs, privateVarsInfo.llvmVars,
+ privateVarsInfo.privatizers)) {
+ // This was handled above.
+ if (!privateDecl.readsFromMold())
+ continue;
+ // Fix broken pass-by-value case for Fortran character boxes
+ if (!mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
+ llvmPrivateVar = builder.CreateLoad(
+ moduleTranslation.convertType(blockArg.getType()), llvmPrivateVar);
+ }
+ assert(llvmPrivateVar->getType() ==
+ moduleTranslation.convertType(blockArg.getType()));
+ moduleTranslation.mapValue(blockArg, llvmPrivateVar);
+ }
+
+ auto continuationBlockOrError =
+ convertOmpOpRegions(taskloopOp.getRegion(), "omp.taskloop.region",
+ builder, moduleTranslation);
+ ; // NOTE(review): stray empty statement.
+ if (failed(handleError(continuationBlockOrError, opInst)))
+ return llvm::make_error<PreviouslyReportedError>();
+
+ builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
+
+ // Dummy check to ensure that the task context structure is accessed inside
+ // the outlined fn.
+ llvm::Value *cond = taskStructMgr.isAllocated(); // NOTE(review): 'cond' is never read afterwards; only the emitted comparison is relied upon.
+ return llvm::Error::success();
+ };
+
+ auto loopOp = cast<omp::LoopNestOp>(taskloopOp.getWrappedLoop());
+
+ auto loopInfo = [&]() -> llvm::Expected<llvm::CanonicalLoopInfo *> {
+ llvm::CanonicalLoopInfo *loopInfo = findCurrentLoopInfo(moduleTranslation);
+ return loopInfo;
+ };
+
+ llvm::OpenMPIRBuilder &ompBuilder = *moduleTranslation.getOpenMPBuilder(); // NOTE(review): not referenced below; the call re-fetches the builder.
+ llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
+ moduleTranslation.getOpenMPBuilder()->createTaskloop(
+ ompLoc, allocaIP, bodyCB, loopInfo,
+ moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[0]), // Only element 0 of the bounds/steps is passed on.
+ moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[0]),
+ moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]));
+
+ if (failed(handleError(afterIP, opInst)))
+ return failure();
+
+ builder.restoreIP(*afterIP);
+
+ // Clean up the private variables and free the task context structure at the insertion point returned by createTaskloop().
+ if (failed(cleanupPrivateVars(builder, moduleTranslation, taskloopOp.getLoc(),
+ llvmFirstPrivateVars,
+ privateVarsInfo.privatizers)))
+ return failure();
+
+ taskStructMgr.freeStructPtr();
+
+ return success();
+}
+
/// Converts an OpenMP taskgroup construct into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder,
@@ -6647,6 +6895,9 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder,
.Case([&](omp::TaskOp op) {
return convertOmpTaskOp(op, builder, moduleTranslation);
})
+ .Case([&](omp::TaskloopOp op) {
+ return convertOmpTaskloopOp(*op, builder, moduleTranslation);
+ })
.Case([&](omp::TaskgroupOp op) {
return convertOmpTaskgroupOp(op, builder, moduleTranslation);
})
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
new file mode 100644
index 0000000000000..536a1fe9d9157
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
@@ -0,0 +1,151 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+// Privatizer for the i32 private variable `i`; it declares no init or copy region.
+omp.private {type = private} @_QFtestEi_private_i32 : i32
+// Firstprivate privatizer for the i32 variable `a`: the copy region loads the value from %arg0 and stores it to %arg1.
+omp.private {type = firstprivate} @_QFtestEa_firstprivate_i32 : i32 copy {
+^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ %0 = llvm.load %arg0 : !llvm.ptr -> i32
+ llvm.store %0, %arg1 : i32, !llvm.ptr
+ omp.yield(%arg1 : !llvm.ptr)
+}
+
+// Taskloop from 1 to 5 (inclusive, step 1) with `a` firstprivate (20 is stored to it beforehand) and `i` private; each iteration stores the induction variable to `i` and adds 1 to the private copy of `a`.
+llvm.func @_QPtest() {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+ %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
+ %6 = llvm.mlir.constant(20 : i32) : i32
+ llvm.store %6, %3 : i32, !llvm.ptr
+ %7 = llvm.mlir.constant(1 : i32) : i32
+ %8 = llvm.mlir.constant(5 : i32) : i32
+ %9 = llvm.mlir.constant(1 : i32) : i32
+ omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+ omp.loop_nest (%arg2) : i32 = (%7) to (%8) inclusive step (%9) {
+ llvm.store %arg2, %arg1 : i32, !llvm.ptr
+ %10 = llvm.load %arg0 : !llvm.ptr -> i32
+ %11 = llvm.mlir.constant(1 : i32) : i32
+ %12 = llvm.add %10, %11 : i32
+ llvm.store %12, %arg0 : i32, !llvm.ptr
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// CHECK: %struct.kmp_task_info = type { ptr, ptr, i32, ptr, ptr, i64, i64, i64 }
+
+// CHECK-LABEL: define void @_QPtest() {
+// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8
+// CHECK: %[[VAL1:.*]] = alloca i32, i64 1, align 4
+// CHECK: %[[VAL_X:.*]] = alloca i32, i64 1, align 4
+// CHECK: store i32 20, ptr %[[VAL_X]], align 4
+// CHECK: br label %entry
+
+// CHECK: entry:
+// CHECK: br label %omp.private.init
+
+// CHECK: omp.private.init: ; preds = %entry
+// CHECK: %[[OMP_TASK_CONTEXT_PTR:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ i32 }, ptr null, i32 1) to i64))
+// CHECK: %[[PRIV_GEP:.*]] = getelementptr { i32 }, ptr %[[OMP_TASK_CONTEXT_PTR]], i32 0, i32 0
+// CHECK: br label %omp.private.copy
+
+// CHECK: omp.private.copy:
+// CHECK: br label %omp.private.copy1
+
+// CHECK: omp.private.copy1:
+// CHECK: %[[LOAD_X:.*]] = load i32, ptr %[[VAL_X]], align 4
+// CHECK: store i32 %[[LOAD_X]], ptr %[[PRIV_GEP]], align 4
+// CHECK: br label %omp.taskloop.start
+
+// CHECK: omp.taskloop.start:
+// CHECK: br label %codeRepl
+
+// CHECK: codeRepl:
+// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0
+// CHECK: store ptr %[[OMP_TASK_CONTEXT_PTR]], ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8
+// CHECK: %[[GTID:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK: call void @__kmpc_taskgroup(ptr @1, i32 %[[GTID]])
+// CHECK: %[[TASK_PTR:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[GTID]], i32 1, i64 64, i64 8, ptr @_QPtest..omp_par)
+// CHECK: %[[LB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 5
+// CHECK: store i32 1, ptr %[[LB_GEP]], align 4
+// CHECK: %[[UB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 6
+// CHECK: store i32 5, ptr %[[UB_GEP]], align 4
+// CHECK: %[[STEP_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 7
+// CHECK: store i64 1, ptr %[[STEP_GEP]], align 4
+// CHECK: %[[LOAD_STEP:.*]] = load i64, ptr %[[STEP_GEP]], align 4
+// CHECK: %10 = load ptr, ptr %[[TASK_PTR]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %10, ptr align 1 %[[STRUCTARG]], i64 8, i1 false)
+// CHECK: call void @__kmpc_taskloop(ptr @1, i32 %[[GTID]], ptr %[[TASK_PTR]], i32 1, ptr %[[LB_GEP]], ptr %[[UB_GEP]], i64 %[[LOAD_STEP]], i32 1, i32 0, i64 0, ptr null)
+// CHECK: call void @__kmpc_end_taskgroup(ptr @1, i32 %[[GTID]])
+// CHECK: br label %taskloop.exit
+
+// CHECK: taskloop.exit:
+// CHECK: tail call void @free(ptr %[[OMP_TASK_CONTEXT_PTR]])
+// CHECK: ret void
+// CHECK: }
+
+// CHECK-LABEL: define internal void @_QPtest..omp_par
+// CHECK-SAME: i32 %[[GLOBAL_TID:.*]], ptr %[[TASK_PTR1:.*]]) {
+// CHECK: taskloop.alloca:
+// CHECK: %[[LOAD_TASK_PTR:.*]] = load ptr, ptr %[[TASK_PTR1]], align 8
+// CHECK: %[[GEP_LB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 5
+// CHECK: %[[LB:.*]] = load i32, ptr %[[GEP_LB]], align 4
+// CHECK: %[[GEP_UB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 6
+// CHECK: %[[UB:.*]] = load i32, ptr %[[GEP_UB]], align 4
+// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[LOAD_TASK_PTR]], i32 0, i32 0
+// CHECK: %[[LOADGEP_OMP_TASK_CONTEXT_PTR:.*]] = load ptr, ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8, !align !1
+// CHECK: %[[OMP_PRIVATE_ALLOC:.*]] = alloca i32, align 4
+// CHECK: br label %taskloop.body
+
+// CHECK: taskloop.body:
+// CHECK: %[[LOAD_X:.*]] = getelementptr { i32 }, ptr %[[LOADGEP_OMP_TASK_CONTEXT_PTR]], i32 0, i32 0
+// CHECK: br label %omp.taskloop.region
+
+// CHECK: omp.taskloop.region:
+// CHECK: br label %omp_loop.preheader
+
+// CHECK: omp_loop.preheader:
+// CHECK: %[[VAL2:.*]] = sub i32 %[[UB]], %[[LB]]
+// CHECK: %[[TRIP_CNT:.*]] = add i32 %[[VAL2]], 1
+// CHECK: br label %omp_loop.header
+
+// CHECK: omp_loop.header:
+// CHECK: %[[OMP_LOOP_IV:.*]] = phi i32 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
+// CHECK: br label %omp_loop.cond
+
+// CHECK: omp_loop.cond:
+// CHECK: %[[OMP_LOOP_CMP:.*]] = icmp ult i32 %[[OMP_LOOP_IV]], %[[TRIP_CNT]]
+// CHECK: br i1 %[[OMP_LOOP_CMP]], label %omp_loop.body, label %omp_loop.exit
+
+// CHECK: omp_loop.exit:
+// CHECK: br label %omp_loop.after
+
+// CHECK: omp_loop.after:
+// CHECK: br label %omp.region.cont
+
+// CHECK: omp.region.cont:
+// CHECK: %[[IS_ALLOCATED:.*]] = icmp ne ptr %[[LOADGEP_OMP_TASK_CONTEXT_PTR]], null
+// CHECK: br label %taskloop.exit.exitStub
+
+// CHECK: omp_loop.body:
+// CHECK: %[[VAL3:.*]] = mul i32 %[[OMP_LOOP_IV]], 1
+// CHECK: %[[VAL5:.*]] = add i32 %[[VAL3]], %[[LB]]
+// CHECK: br label %omp.loop_nest.region
+
+// CHECK: omp.loop_nest.region:
+// CHECK: store i32 %[[VAL5]], ptr %[[OMP_PRIVATE_ALLOC]], align 4
+// CHECK: %[[VAL6:.*]] = load i32, ptr %[[LOAD_X]], align 4
+// CHECK: %[[RES:.*]] = add i32 %[[VAL6]], 1
+// CHECK: store i32 %[[RES]], ptr %[[LOAD_X]], align 4
+// CHECK: br label %omp.region.cont2
+
+// CHECK: omp.region.cont2:
+// CHECK: br label %omp_loop.inc
+
+// CHECK: omp_loop.inc:
+// CHECK: %omp_loop.next = add nuw i32 %[[OMP_LOOP_IV]], 1
+// CHECK: br label %omp_loop.header
+
+// CHECK: taskloop.exit.exitStub:
+// CHECK: ret void
+// CHECK: }
\ No newline at end of file
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index d4cc9e215de1d..8dc3b38f67f27 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -321,21 +321,8 @@ llvm.func @taskgroup_task_reduction(%x : !llvm.ptr) {
// -----
-llvm.func @taskloop(%lb : i32, %ub : i32, %step : i32) {
- // expected-error@below {{not yet implemented: omp.taskloop}}
- // expected-error@below {{LLVM Translation failed for operation: omp.taskloop}}
- omp.taskloop {
- omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
- omp.yield
- }
- }
- llvm.return
-}
-
-// -----
-
llvm.func @taskloop_untied(%lb : i32, %ub : i32, %step : i32) {
- // expected-error@below {{not yet implemented: omp.taskloop}}
+ // expected-error@below {{not yet implemented: Unhandled clause untied in omp.taskloop operation}}
// expected-error@below {{LLVM Translation failed for operation: omp.taskloop}}
omp.taskloop untied {
omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
>From b49ac002fd186ee64d9dad96a7d62b0c837d6306 Mon Sep 17 00:00:00 2001
From: Kaviya Rajendiran <kaviyara2000 at gmail.com>
Date: Mon, 17 Nov 2025 00:27:35 +0530
Subject: [PATCH 02/16] [Flang][OpenMP] Addressed review comments
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 27 +++++++++++--------
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 23 +++++++++++++---
mlir/test/Target/LLVMIR/openmp-taskloop.mlir | 10 ++++---
3 files changed, 41 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2d7c50ece7199..b0ba1b507dd3f 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2028,7 +2028,7 @@ static Value *emitTaskDependencies(
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
const LocationDescription &Loc, InsertPointTy AllocaIP,
BodyGenCallbackTy BodyGenCB,
- llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> loopInfo,
+ llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
Value *LBVal, Value *UBVal, Value *StepVal, bool Tied) {
if (!updateToLocation(Loc))
@@ -2053,7 +2053,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
return Err;
- llvm::Expected<llvm::CanonicalLoopInfo *> result = loopInfo();
+ llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
if (!result) {
return result.takeError();
}
@@ -2128,12 +2128,13 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
// and set up the lowerbound,upperbound and step values
llvm::Value *lb =
Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 5);
- // Value *LbVal_ext = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
- Builder.CreateStore(LBVal, lb);
+ Value *LbVal_ext = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
+ Builder.CreateStore(LbVal_ext, lb);
llvm::Value *ub =
Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 6);
- Builder.CreateStore(UBVal, ub);
+ Value *UbVal_ext = Builder.CreateSExt(UBVal, Builder.getInt64Ty());
+ Builder.CreateStore(UbVal_ext, ub);
llvm::Value *step =
Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 7);
@@ -2155,6 +2156,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Value *NoGroup = Builder.getInt32(1);
Value *Sched = Builder.getInt32(0);
Value *GrainSize = Builder.getInt64(0);
+
+ // TODO: Handle the case when TaskDup pointer isn't empty
Value *TaskDup = Constant::getNullValue(Builder.getPtrTy());
Value *Args[] = {Ident, ThreadID, TaskData, IfVal, lb, ub,
@@ -2184,13 +2187,15 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Type *IVTy = IV->getType();
Constant *One = ConstantInt::get(IVTy, 1);
- Value *task_lb = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
- OutlinedFn.getArg(1), 5, "gep_lb");
- Value *LowerBound = Builder.CreateLoad(IVTy, task_lb, "lb");
+ Value *TaskLB = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
+ OutlinedFn.getArg(1), 5, "gep_lb");
+ Value *LoadTaskLB = Builder.CreateLoad(Builder.getInt64Ty(), TaskLB);
+ Value *LowerBound = Builder.CreateTrunc(LoadTaskLB, IVTy, "lb");
- Value *task_ub = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
- OutlinedFn.getArg(1), 6, "gep_ub");
- Value *UpperBound = Builder.CreateLoad(IVTy, task_ub, "ub");
+ Value *TaskUB = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
+ OutlinedFn.getArg(1), 6, "gep_ub");
+ Value *LoadTaskUB = Builder.CreateLoad(Builder.getInt64Ty(), TaskUB);
+ Value *UpperBound = Builder.CreateTrunc(LoadTaskUB, IVTy, "ub");
Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index dac63dccb7a3c..e75df77f0f910 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -345,6 +345,10 @@ static LogicalResult checkImplementationStatus(Operation &op) {
result = todo("cancel directive inside of taskloop");
}
};
+ auto checkCollapse = [&todo](auto op, LogicalResult &result) {
+ if (op.getCollapseNumLoops() > 1)
+ result = todo("collapse");
+ };
auto checkDepend = [&todo](auto op, LogicalResult &result) {
if (!op.getDependVars().empty() || op.getDependKinds())
result = todo("depend");
@@ -361,9 +365,9 @@ static LogicalResult checkImplementationStatus(Operation &op) {
if (op.getGrainsize())
result = todo("grainsize");
};
- auto checkIf = [](auto op, LogicalResult &) {
+ auto checkIf = [&todo](auto op, LogicalResult &result) {
if (op.getIfExpr())
- op.emitWarning("if");
+ result = todo("if");
};
auto checkMergeable = [&todo](auto op, LogicalResult &result) {
if (op.getMergeable())
@@ -435,6 +439,10 @@ static LogicalResult checkImplementationStatus(Operation &op) {
checkAllocate(op, result);
checkOrder(op, result);
})
+ .Case([&](omp::LoopNestOp op) {
+ if (mlir::isa<omp::TaskloopOp>(op.getOperation()->getParentOp()))
+ checkCollapse(op, result);
+ })
.Case([&](omp::OrderedRegionOp op) { checkParLevelSimd(op, result); })
.Case([&](omp::SectionsOp op) {
checkAllocate(op, result);
@@ -2735,7 +2743,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
// dummy check to ensure that the task context structure is accessed inside
// the outlined fn.
- llvm::Value *cond = taskStructMgr.isAllocated();
+ [[maybe_unused]] llvm::Value *cond = taskStructMgr.isAllocated();
return llvm::Error::success();
};
@@ -2746,7 +2754,6 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
return loopInfo;
};
- llvm::OpenMPIRBuilder &ompBuilder = *moduleTranslation.getOpenMPBuilder();
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
moduleTranslation.getOpenMPBuilder()->createTaskloop(
@@ -2766,6 +2773,11 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
privateVarsInfo.privatizers)))
return failure();
+ // Note: This free is valid because end_taskgroup waits until all generated
+ // tasks are complete before returning. In the presence of the Nogroup clause,
+ // @__kmpc_taskgroup(..)/@__kmpc_end_taskgroup(..) is not called, so we must
+ // ensure that this freeStructPtr() is not called until every thread has
+ // completed execution.
taskStructMgr.freeStructPtr();
return success();
@@ -3422,6 +3434,9 @@ convertOmpLoopNest(Operation &opInst, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
auto loopOp = cast<omp::LoopNestOp>(opInst);
+ if (failed(checkImplementationStatus(opInst)))
+ return failure();
+
// Set up the source location value for OpenMP runtime.
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
index 536a1fe9d9157..8179784a47d90 100644
--- a/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
@@ -67,9 +67,9 @@ llvm.func @_QPtest() {
// CHECK: call void @__kmpc_taskgroup(ptr @1, i32 %[[GTID]])
// CHECK: %[[TASK_PTR:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[GTID]], i32 1, i64 64, i64 8, ptr @_QPtest..omp_par)
// CHECK: %[[LB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 5
-// CHECK: store i32 1, ptr %[[LB_GEP]], align 4
+// CHECK: store i64 1, ptr %[[LB_GEP]], align 4
// CHECK: %[[UB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 6
-// CHECK: store i32 5, ptr %[[UB_GEP]], align 4
+// CHECK: store i64 5, ptr %[[UB_GEP]], align 4
// CHECK: %[[STEP_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 7
// CHECK: store i64 1, ptr %[[STEP_GEP]], align 4
// CHECK: %[[LOAD_STEP:.*]] = load i64, ptr %[[STEP_GEP]], align 4
@@ -89,9 +89,11 @@ llvm.func @_QPtest() {
// CHECK: taskloop.alloca:
// CHECK: %[[LOAD_TASK_PTR:.*]] = load ptr, ptr %[[TASK_PTR1]], align 8
// CHECK: %[[GEP_LB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 5
-// CHECK: %[[LB:.*]] = load i32, ptr %[[GEP_LB]], align 4
+// CHECK: %[[LOAD_LB64:.*]] = load i64, ptr %[[GEP_LB]], align 4
+// CHECK: %[[LB:.*]] = trunc i64 %[[LOAD_LB64]] to i32
// CHECK: %[[GEP_UB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 6
-// CHECK: %[[UB:.*]] = load i32, ptr %[[GEP_UB]], align 4
+// CHECK: %[[LOAD_UB64:.*]] = load i64, ptr %[[GEP_UB]], align 4
+// CHECK: %[[UB:.*]] = trunc i64 %[[LOAD_UB64]] to i32
// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[LOAD_TASK_PTR]], i32 0, i32 0
// CHECK: %[[LOADGEP_OMP_TASK_CONTEXT_PTR:.*]] = load ptr, ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8, !align !1
// CHECK: %[[OMP_PRIVATE_ALLOC:.*]] = alloca i32, align 4
>From 1eeff8748a38ce13e8eef197baf6e1cb6acc62e6 Mon Sep 17 00:00:00 2001
From: Kaviya Rajendiran <kaviyara2000 at gmail.com>
Date: Mon, 17 Nov 2025 14:14:57 +0530
Subject: [PATCH 03/16] [Flang][OpenMP]Added TODO testcases for taskloop
clauses
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 2 +-
mlir/test/Target/LLVMIR/openmp-todo.mlir | 161 ++++++++++++++++++
2 files changed, 162 insertions(+), 1 deletion(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index e75df77f0f910..4cb8e57a5c78f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -411,7 +411,7 @@ static LogicalResult checkImplementationStatus(Operation &op) {
result = todo("privatization");
};
auto checkReduction = [&todo](auto op, LogicalResult &result) {
- if (isa<omp::TeamsOp>(op))
+ if (isa<omp::TeamsOp>(op) || isa<omp::TaskloopOp>(op))
if (!op.getReductionVars().empty() || op.getReductionByref() ||
op.getReductionSyms())
result = todo("reduction");
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 8dc3b38f67f27..cb15ad0b199a1 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -318,6 +318,167 @@ llvm.func @taskgroup_task_reduction(%x : !llvm.ptr) {
}
llvm.return
}
+// -----
+
+llvm.func @taskloop_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error at below {{not yet implemented: Unhandled clause allocate in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop allocate(%x : !llvm.ptr -> %x : !llvm.ptr) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_collapse(%lb : i32, %ub : i32, %step : i32, %lb1 : i32, %ub1 : i32, %step1 : i32) {
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop {
+ // expected-error at below {{not yet implemented: Unhandled clause collapse in omp.loop_nest operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.loop_nest}}
+ omp.loop_nest (%iv, %iv1) : i32 = (%lb, %lb1) to (%ub, %ub1) inclusive step (%step, %step1) collapse(2) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_final(%lb : i32, %ub : i32, %step : i32, %true : i1) {
+ // expected-error at below {{not yet implemented: Unhandled clause final in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop final(%true) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_grainsize(%lb : i32, %ub : i32, %step : i32, %grainsize : i32) {
+ // expected-error at below {{not yet implemented: Unhandled clause grainsize in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop grainsize(%grainsize: i32) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_if(%lb : i32, %ub : i32, %step : i32, %true : i1) {
+ // expected-error at below {{not yet implemented: Unhandled clause if in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop if(%true) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+ omp.declare_reduction @add_reduction_i32 : i32 init {
+ ^bb0(%arg0: i32):
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%0 : i32)
+ }combiner {
+ ^bb0(%arg0: i32, %arg1: i32):
+ %0 = llvm.add %arg0, %arg1 : i32
+ omp.yield(%0 : i32)
+ }
+
+llvm.func @taskloop_inreduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error at below {{not yet implemented: Unhandled clause in_reduction in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop in_reduction(@add_reduction_i32 %x -> %arg0 : !llvm.ptr) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_mergeable(%lb : i32, %ub : i32, %step : i32) {
+ // expected-error at below {{not yet implemented: Unhandled clause mergeable in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop mergeable {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_nogroup(%lb : i32, %ub : i32, %step : i32) {
+ // expected-error at below {{not yet implemented: Unhandled clause nogroup in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop nogroup {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_num_tasks(%lb : i32, %ub : i32, %step : i32, %numtasks : i32) {
+ // expected-error at below {{not yet implemented: Unhandled clause num_tasks in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop num_tasks(%numtasks: i32) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+
+llvm.func @taskloop_priority(%lb : i32, %ub : i32, %step : i32, %priority : i32) {
+ // expected-error at below {{not yet implemented: Unhandled clause priority in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop priority(%priority: i32) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
+
+// -----
+ omp.declare_reduction @add_reduction_i32 : i32 init {
+ ^bb0(%arg0: i32):
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%0 : i32)
+ }combiner {
+ ^bb0(%arg0: i32, %arg1: i32):
+ %0 = llvm.add %arg0, %arg1 : i32
+ omp.yield(%0 : i32)
+ }
+
+llvm.func @taskloop_reduction(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
+ // expected-error at below {{not yet implemented: Unhandled clause reduction in omp.taskloop operation}}
+ // expected-error at below {{LLVM Translation failed for operation: omp.taskloop}}
+ omp.taskloop reduction(@add_reduction_i32 %x -> %arg0 : !llvm.ptr) {
+ omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+ omp.yield
+ }
+ }
+ llvm.return
+}
// -----
>From 444a746b52b5802083a907b2a36c1ef09a6a988d Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Tue, 2 Dec 2025 08:47:54 +0000
Subject: [PATCH 04/16] [Flang][OpenMP] Rework bounds in Taskloop lowering
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 137 +++++++++++++---------
1 file changed, 79 insertions(+), 58 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index b0ba1b507dd3f..848ac0e4a7987 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2065,10 +2065,16 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
OI.ExitBB = TaskloopExitBB;
// Add the thread ID argument.
- SmallVector<Instruction *, 4> ToBeDeleted;
+ SmallVector<Instruction *> ToBeDeleted;
// dummy instruction to be used as a fake argument
OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
+ createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
+ "global.lb", false);
+ createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
+ "global.ub", false);
+ createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
+ "global.step", false);
OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Tied,
TaskloopAllocaBB, CLI, Loc,
@@ -2078,9 +2084,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
"there must be a single user for the outlined function");
CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
- // HasShareds is true if any variables are captured in the outlined region,
- // false otherwise.
- bool HasShareds = StaleCI->arg_size() > 1;
Builder.SetInsertPoint(StaleCI);
// Gather the arguments for emitting the runtime call for
@@ -2101,20 +2104,17 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Value *TaskSize = Builder.getInt64(
divideCeil(M.getDataLayout().getTypeSizeInBits(Taskloop), 8));
- Value *SharedsSize = Builder.getInt64(0);
- if (HasShareds) {
- AllocaInst *ArgStructAlloca =
- dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
- assert(ArgStructAlloca &&
- "Unable to find the alloca instruction corresponding to arguments "
- "for extracted function");
- StructType *ArgStructType =
- dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
- assert(ArgStructType && "Unable to find struct type corresponding to "
- "arguments for extracted function");
- SharedsSize =
- Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
- }
+ AllocaInst *ArgStructAlloca =
+ dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
+ assert(ArgStructAlloca &&
+ "Unable to find the alloca instruction corresponding to arguments "
+ "for extracted function");
+ StructType *ArgStructType =
+ dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
+ assert(ArgStructType && "Unable to find struct type corresponding to "
+ "arguments for extracted function");
+ Value *SharedsSize =
+ Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
// Emit the @__kmpc_omp_task_alloc runtime call
// The runtime call returns a pointer to an area where the task captured
@@ -2124,31 +2124,25 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
/*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
/*task_func=*/&OutlinedFn});
+ Value *Shareds = StaleCI->getArgOperand(1);
+ Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
+ Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
+ Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
+ SharedsSize);
// Get the pointer to loop lb, ub, step from task ptr
// and set up the lowerbound,upperbound and step values
- llvm::Value *lb =
- Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 5);
- Value *LbVal_ext = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
- Builder.CreateStore(LbVal_ext, lb);
-
- llvm::Value *ub =
- Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 6);
- Value *UbVal_ext = Builder.CreateSExt(UBVal, Builder.getInt64Ty());
- Builder.CreateStore(UbVal_ext, ub);
-
- llvm::Value *step =
- Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 7);
- Value *Step_ext = Builder.CreateSExt(StepVal, Builder.getInt64Ty());
- Builder.CreateStore(Step_ext, step);
- llvm::Value *loadstep = Builder.CreateLoad(Builder.getInt64Ty(), step);
+ llvm::Value *Lb = Builder.CreateStructGEP(ArgStructType, TaskShareds, 0);
+ Value *LbValExt = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
+ Builder.CreateStore(LbValExt, Lb);
- if (HasShareds) {
- Value *Shareds = StaleCI->getArgOperand(1);
- Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
- Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
- Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
- SharedsSize);
- }
+ llvm::Value *Ub = Builder.CreateStructGEP(ArgStructType, TaskShareds, 1);
+ Value *UbValExt = Builder.CreateSExt(UBVal, Builder.getInt64Ty());
+ Builder.CreateStore(UbValExt, Ub);
+
+ llvm::Value *Step = Builder.CreateStructGEP(ArgStructType, TaskShareds, 2);
+ Value *StepExt = Builder.CreateSExt(StepVal, Builder.getInt64Ty());
+ Builder.CreateStore(StepExt, Step);
+ llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
// set up the arguments for emitting kmpc_taskloop runtime call
// setting default values for ifval, nogroup, sched, grainsize, task_dup
@@ -2160,8 +2154,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
// TODO: Handle the case when TaskDup pointer isn't empty
Value *TaskDup = Constant::getNullValue(Builder.getPtrTy());
- Value *Args[] = {Ident, ThreadID, TaskData, IfVal, lb, ub,
- loadstep, NoGroup, Sched, GrainSize, TaskDup};
+ Value *Args[] = {Ident, ThreadID, TaskData, IfVal, Lb, Ub,
+ Loadstep, NoGroup, Sched, GrainSize, TaskDup};
// taskloop runtime call
Function *TaskloopFn =
@@ -2177,29 +2171,53 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
- if (HasShareds) {
- LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
- OutlinedFn.getArg(1)->replaceUsesWithIf(
- Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
- }
+ LoadInst *SharedsOutlined =
+ Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
+ OutlinedFn.getArg(1)->replaceUsesWithIf(
+ SharedsOutlined,
+ [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
Value *IV = CLI->getIndVar();
Type *IVTy = IV->getType();
Constant *One = ConstantInt::get(IVTy, 1);
- Value *TaskLB = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
- OutlinedFn.getArg(1), 5, "gep_lb");
- Value *LoadTaskLB = Builder.CreateLoad(Builder.getInt64Ty(), TaskLB);
- Value *LowerBound = Builder.CreateTrunc(LoadTaskLB, IVTy, "lb");
-
- Value *TaskUB = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop,
- OutlinedFn.getArg(1), 6, "gep_ub");
- Value *LoadTaskUB = Builder.CreateLoad(Builder.getInt64Ty(), TaskUB);
- Value *UpperBound = Builder.CreateTrunc(LoadTaskUB, IVTy, "ub");
+ // When outlining, CodeExtractor will create GEP's to the LowerBound and
+ // UpperBound. These GEP's can be reused for loading the tasks respective
+ // bounds.
+ Value *TaskLB = nullptr;
+ Value *TaskUB = nullptr;
+ Value *LoadTaskLB = nullptr;
+ Value *LoadTaskUB = nullptr;
+ for (Instruction &I : *TaskloopAllocaBB) {
+ if (I.getOpcode() == Instruction::GetElementPtr) {
+ GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
+ switch (CI->getZExtValue()) {
+ case 0:
+ TaskLB = &I;
+ break;
+ case 1:
+ TaskUB = &I;
+ break;
+ }
+ }
+ } else if (I.getOpcode() == Instruction::Load) {
+ LoadInst &Load = cast<LoadInst>(I);
+ if (Load.getPointerOperand() == TaskLB) {
+ assert(TaskLB != nullptr && "Expected value for TaskLB");
+ LoadTaskLB = &I;
+ } else if (Load.getPointerOperand() == TaskUB) {
+ assert(TaskUB != nullptr && "Expected value for TaskUB");
+ LoadTaskUB = &I;
+ }
+ }
+ }
Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
- Value *TripCountMinusOne = Builder.CreateSub(UpperBound, LowerBound);
+ assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
+ assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
+ Value *TripCountMinusOne = Builder.CreateSub(LoadTaskUB, LoadTaskLB);
Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
// set the trip count in the CLI
CLI->setTripCount(TripCount);
@@ -2213,13 +2231,16 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
if (Add->getOpcode() == llvm::Instruction::Add) {
if (llvm::isa<llvm::BinaryOperator>(Add->getOperand(0))) {
// update the starting index of the loop
- Add->setOperand(1, LowerBound);
+ Add->setOperand(1, LoadTaskLB);
}
}
}
}
for (Instruction *I : llvm::reverse(ToBeDeleted)) {
+ while (!I->use_empty()) {
+ I->user_back()->eraseFromParent();
+ }
I->eraseFromParent();
}
};
>From 58bd07685ee013024eb560ab55bda566df9e5bcf Mon Sep 17 00:00:00 2001
From: Jack Styles <jack.styles at arm.com>
Date: Wed, 17 Dec 2025 11:42:46 +0000
Subject: [PATCH 05/16] Updates to bounds rework
- Force the first 3 entries to the StructArg to be the bounds info
- Ensure it will work when executing the tasks in parallel
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 2 +
.../llvm/Transforms/Utils/CodeExtractor.h | 4 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 76 ++++++++++++-------
3 files changed, 51 insertions(+), 31 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 88b698bc71874..ae69f2fcd5cdc 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -14,6 +14,7 @@
#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
+#include "llvm/ADT/SetVector.h"
#include "llvm/Frontend/Atomic/Atomic.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
@@ -2366,6 +2367,7 @@ class OpenMPIRBuilder {
PostOutlineCBTy PostOutlineCB;
BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
SmallVector<Value *, 2> ExcludeArgsFromAggregate;
+ SetVector<Value *> Inputs, Outputs;
// TODO: this should be safe to enable by default
bool FixUpNonEntryAllocas = false;
diff --git a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
index 407eb50d2c7a3..3e2c69b47bc48 100644
--- a/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeExtractor.h
@@ -171,9 +171,9 @@ class CodeExtractorAnalysisCache {
///
/// \param CEAC - Cache to speed up operations for the CodeExtractor when
/// hoisting, and extracting lifetime values and assumes.
- /// \param Inputs [out] - filled with values marked as inputs to the
+ /// \param Inputs [in/out] - filled with values marked as inputs to the
/// newly outlined function.
- /// \param Outputs [out] - filled with values marked as outputs to the
+ /// \param Outputs [out] - filled with values marked as outputs to the
/// newly outlined function.
/// \returns zero when called on a CodeExtractor instance where isEligible
/// returns false.
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 848ac0e4a7987..2088abd3e8c3e 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -403,18 +403,19 @@ Value *createFakeIntVal(IRBuilderBase &Builder,
OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
- const Twine &Name = "", bool AsPtr = true) {
+ const Twine &Name = "", bool AsPtr = true,
+ bool Is64Bit = false) {
Builder.restoreIP(OuterAllocaIP);
+ IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
Instruction *FakeVal;
AllocaInst *FakeValAddr =
- Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
+ Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
ToBeDeleted.push_back(FakeValAddr);
if (AsPtr) {
FakeVal = FakeValAddr;
} else {
- FakeVal =
- Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
+ FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
ToBeDeleted.push_back(FakeVal);
}
@@ -422,11 +423,10 @@ Value *createFakeIntVal(IRBuilderBase &Builder,
Builder.restoreIP(InnerAllocaIP);
Instruction *UseFakeVal;
if (AsPtr) {
- UseFakeVal =
- Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
+ UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
} else {
- UseFakeVal =
- cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
+ UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
+ FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
}
ToBeDeleted.push_back(UseFakeVal);
return FakeVal;
@@ -830,7 +830,8 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
for (auto *V : OI.ExcludeArgsFromAggregate)
Extractor.excludeArgFromAggregate(V);
- Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
+ Function *OutlinedFn =
+ Extractor.extractCodeRegion(CEAC, OI.Inputs, OI.Outputs);
// Forward target-cpu, target-features attributes to the outlined function.
auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
@@ -2069,21 +2070,39 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
// dummy instruction to be used as a fake argument
OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
- createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
- "global.lb", false);
- createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
- "global.ub", false);
- createFakeIntVal(Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP,
- "global.step", false);
+ Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
+ TaskloopAllocaIP, "lb", false, true);
+ Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
+ TaskloopAllocaIP, "ub", false, true);
+ Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
+ TaskloopAllocaIP, "step", false, true);
+ /* For Taskloop, we want to force the bounds to be the first 3 inputs in the
+ * aggregate struct. */
+ OI.Inputs.insert(FakeLB);
+ OI.Inputs.insert(FakeUB);
+ OI.Inputs.insert(FakeStep);
OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Tied,
- TaskloopAllocaBB, CLI, Loc,
- ToBeDeleted](Function &OutlinedFn) mutable {
+ TaskloopAllocaBB, CLI, Loc, ToBeDeleted, FakeLB, FakeUB,
+ FakeStep](Function &OutlinedFn) mutable {
// Replace the Stale CI by appropriate RTL function call.
assert(OutlinedFn.hasOneUse() &&
"there must be a single user for the outlined function");
CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
+ /* Create the casts of the bounds values that are used when outlining to
+ * replace the uses of the fake values with the real values. */
+ BasicBlock *CodeReplBB = StaleCI->getParent();
+ IRBuilderBase::InsertPoint CurrentIp = Builder.saveIP();
+ Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
+ Value *CastedLBVal =
+ Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
+ Value *CastedUBVal =
+ Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
+ Value *CastedStepVal =
+ Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
+ Builder.restoreIP(CurrentIp);
+
Builder.SetInsertPoint(StaleCI);
// Gather the arguments for emitting the runtime call for
@@ -2132,16 +2151,13 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
// Get the pointer to loop lb, ub, step from task ptr
// and set up the lowerbound,upperbound and step values
llvm::Value *Lb = Builder.CreateStructGEP(ArgStructType, TaskShareds, 0);
- Value *LbValExt = Builder.CreateSExt(LBVal, Builder.getInt64Ty());
- Builder.CreateStore(LbValExt, Lb);
+ Builder.CreateStore(CastedLBVal, Lb);
llvm::Value *Ub = Builder.CreateStructGEP(ArgStructType, TaskShareds, 1);
- Value *UbValExt = Builder.CreateSExt(UBVal, Builder.getInt64Ty());
- Builder.CreateStore(UbValExt, Ub);
+ Builder.CreateStore(CastedUBVal, Ub);
llvm::Value *Step = Builder.CreateStructGEP(ArgStructType, TaskShareds, 2);
- Value *StepExt = Builder.CreateSExt(StepVal, Builder.getInt64Ty());
- Builder.CreateStore(StepExt, Step);
+ Builder.CreateStore(CastedStepVal, Step);
llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
// set up the arguments for emitting kmpc_taskloop runtime call
@@ -2179,7 +2195,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Value *IV = CLI->getIndVar();
Type *IVTy = IV->getType();
- Constant *One = ConstantInt::get(IVTy, 1);
+ Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
// When outlining, CodeExtractor will create GEP's to the LowerBound and
// UpperBound. These GEP's can be reused for loading the tasks respective
@@ -2219,8 +2235,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
Value *TripCountMinusOne = Builder.CreateSub(LoadTaskUB, LoadTaskLB);
Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
+ Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
+ Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
// set the trip count in the CLI
- CLI->setTripCount(TripCount);
+ CLI->setTripCount(CastedTripCount);
Builder.SetInsertPoint(CLI->getBody(),
CLI->getBody()->getFirstInsertionPt());
@@ -2231,16 +2249,16 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
if (Add->getOpcode() == llvm::Instruction::Add) {
if (llvm::isa<llvm::BinaryOperator>(Add->getOperand(0))) {
// update the starting index of the loop
- Add->setOperand(1, LoadTaskLB);
+ Add->setOperand(1, CastedTaskLB);
}
}
}
}
+ FakeLB->replaceAllUsesWith(CastedLBVal);
+ FakeUB->replaceAllUsesWith(CastedUBVal);
+ FakeStep->replaceAllUsesWith(CastedStepVal);
for (Instruction *I : llvm::reverse(ToBeDeleted)) {
- while (!I->use_empty()) {
- I->user_back()->eraseFromParent();
- }
I->eraseFromParent();
}
};
>From f3df9bee3f6f692b5419e227f68aea84b604103d Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Fri, 2 Jan 2026 15:47:52 +0000
Subject: [PATCH 06/16] kaviya's review comments
Comments at https://github.com/Stylie777/llvm-project/pull/3
---
.../include/llvm/Frontend/OpenMP/OMPKinds.def | 1 -
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 50 +++++++++++++------
2 files changed, 34 insertions(+), 17 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index bb12c1558766b..152a8f727310a 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -95,7 +95,6 @@ __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidP
__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr)
__OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8)
__OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr)
-__OMP_STRUCT_TYPE(Taskloop, kmp_task_info, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr, Int64, Int64, Int64)
__OMP_STRUCT_TYPE(ConfigurationEnvironment, ConfigurationEnvironmentTy, false,
Int8, Int8, Int8, Int32, Int32, Int32, Int32, Int32, Int32)
__OMP_STRUCT_TYPE(DynamicEnvironment, DynamicEnvironmentTy, false, Int16)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2088abd3e8c3e..42061e428189c 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2076,8 +2076,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
TaskloopAllocaIP, "ub", false, true);
Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
TaskloopAllocaIP, "step", false, true);
- /* For Taskloop, we want to force the bounds being the first 3 inputs in the
- * aggregate struct*/
+ // For Taskloop, we want to force the bounds being the first 3 inputs in the
+ // aggregate struct
OI.Inputs.insert(FakeLB);
OI.Inputs.insert(FakeUB);
OI.Inputs.insert(FakeStep);
@@ -2121,7 +2121,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Value *Flags = Builder.getInt32(Tied);
Value *TaskSize = Builder.getInt64(
- divideCeil(M.getDataLayout().getTypeSizeInBits(Taskloop), 8));
+ divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
AllocaInst *ArgStructAlloca =
dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
@@ -2150,14 +2150,14 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
SharedsSize);
// Get the pointer to loop lb, ub, step from task ptr
// and set up the lowerbound,upperbound and step values
- llvm::Value *Lb = Builder.CreateStructGEP(ArgStructType, TaskShareds, 0);
- Builder.CreateStore(CastedLBVal, Lb);
+ llvm::Value *Lb = Builder.CreateGEP(
+ ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
- llvm::Value *Ub = Builder.CreateStructGEP(ArgStructType, TaskShareds, 1);
- Builder.CreateStore(CastedUBVal, Ub);
+ llvm::Value *Ub = Builder.CreateGEP(
+ ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
- llvm::Value *Step = Builder.CreateStructGEP(ArgStructType, TaskShareds, 2);
- Builder.CreateStore(CastedStepVal, Step);
+ llvm::Value *Step = Builder.CreateGEP(
+ ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
// set up the arguments for emitting kmpc_taskloop runtime call
@@ -2243,13 +2243,31 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Builder.SetInsertPoint(CLI->getBody(),
CLI->getBody()->getFirstInsertionPt());
- llvm::BasicBlock *Body = CLI->getBody();
- for (llvm::Instruction &I : *Body) {
- if (auto *Add = llvm::dyn_cast<llvm::BinaryOperator>(&I)) {
- if (Add->getOpcode() == llvm::Instruction::Add) {
- if (llvm::isa<llvm::BinaryOperator>(Add->getOperand(0))) {
- // update the starting index of the loop
- Add->setOperand(1, CastedTaskLB);
+ // The canonical loop is generated with a fixed lower bound. We need to
+ // update the index calculation code to use the task's lower bound. The
+ // generated code looks like this:
+ // %omp_loop.iv = phi ...
+ // ...
+ // %tmp = mul [type] %omp_loop.iv, step
+ // %user_index = add [type] %tmp, lb
+ // OpenMPIRBuilder constructs canonical loops to have exactly three uses of
+ // the normalised induction variable:
+ // 1. This one: converting the normalised IV to the user IV
+ // 2. The increment (add)
+ // 3. The comparison against the trip count (icmp)
+ // (1) is the only use that is a mul followed by an add so this cannot match
+ // other IR.
+ assert(CLI->getIndVar()->getNumUses() == 3 &&
+ "Canonical loop should have exactly three uses of the ind var");
+ for (User *IVUser : CLI->getIndVar()->users()) {
+ if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
+ if (Mul->getOpcode() == Instruction::Mul) {
+ for (User *MulUser : Mul->users()) {
+ if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
+ if (Add->getOpcode() == Instruction::Add) {
+ Add->setOperand(1, CastedTaskLB);
+ }
+ }
}
}
}
>From 2c35825087264279982bc360b1706bba71abd44c Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Mon, 5 Jan 2026 10:10:29 +0000
Subject: [PATCH 07/16] [NFC] Refine private var init/copy interfaces
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 60 ++++++++++++++-----
1 file changed, 44 insertions(+), 16 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 4cb8e57a5c78f..a38deca29fd0f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1634,18 +1634,17 @@ findAssociatedValue(Value privateVar, llvm::IRBuilderBase &builder,
/// allocateAndInitPrivateVars instead of this.
/// This returns the private variable which has been initialized. This
/// variable should be mapped before constructing the body of the Op.
-static llvm::Expected<llvm::Value *> initPrivateVar(
- llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation,
- omp::PrivateClauseOp &privDecl, Value mlirPrivVar, BlockArgument &blockArg,
- llvm::Value *llvmPrivateVar, llvm::BasicBlock *privInitBlock,
- llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
+static llvm::Expected<llvm::Value *>
+initPrivateVar(llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation,
+ omp::PrivateClauseOp &privDecl, llvm::Value *nonPrivateVar,
+ BlockArgument &blockArg, llvm::Value *llvmPrivateVar,
+ llvm::BasicBlock *privInitBlock,
+ llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
Region &initRegion = privDecl.getInitRegion();
if (initRegion.empty())
return llvmPrivateVar;
- // map initialization region block arguments
- llvm::Value *nonPrivateVar = findAssociatedValue(
- mlirPrivVar, builder, moduleTranslation, mappedPrivateVars);
assert(nonPrivateVar);
moduleTranslation.mapValue(privDecl.getInitMoldArg(), nonPrivateVar);
moduleTranslation.mapValue(privDecl.getInitPrivateArg(), llvmPrivateVar);
@@ -1670,6 +1669,19 @@ static llvm::Expected<llvm::Value *> initPrivateVar(
return phis[0];
}
+/// Version of initPrivateVar which looks up the nonPrivateVar from mlirPrivVar.
+static llvm::Expected<llvm::Value *> initPrivateVar(
+ llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation,
+ omp::PrivateClauseOp &privDecl, Value mlirPrivVar, BlockArgument &blockArg,
+ llvm::Value *llvmPrivateVar, llvm::BasicBlock *privInitBlock,
+ llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
+ return initPrivateVar(
+ builder, moduleTranslation, privDecl,
+ findAssociatedValue(mlirPrivVar, builder, moduleTranslation,
+ mappedPrivateVars),
+ blockArg, llvmPrivateVar, privInitBlock, mappedPrivateVars);
+}
+
static llvm::Error
initPrivateVars(llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
@@ -1775,7 +1787,7 @@ static bool opIsInSingleThread(mlir::Operation *op) {
static LogicalResult copyFirstPrivateVars(
mlir::Operation *op, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation,
- SmallVectorImpl<mlir::Value> &mlirPrivateVars,
+ SmallVectorImpl<llvm::Value *> &moldVars,
ArrayRef<llvm::Value *> llvmPrivateVars,
SmallVectorImpl<omp::PrivateClauseOp> &privateDecls, bool insertBarrier,
llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
@@ -1793,19 +1805,15 @@ static LogicalResult copyFirstPrivateVars(
splitBB(builder, /*CreateBranch=*/true, "omp.private.copy");
setInsertPointForPossiblyEmptyBlock(builder, copyBlock);
- for (auto [decl, mlirVar, llvmVar] :
- llvm::zip_equal(privateDecls, mlirPrivateVars, llvmPrivateVars)) {
+ for (auto [decl, moldVar, llvmVar] :
+ llvm::zip_equal(privateDecls, moldVars, llvmPrivateVars)) {
if (decl.getDataSharingType() != omp::DataSharingClauseType::FirstPrivate)
continue;
// copyRegion implements `lhs = rhs`
Region &copyRegion = decl.getCopyRegion();
- // map copyRegion rhs arg
- llvm::Value *nonPrivateVar = findAssociatedValue(
- mlirVar, builder, moduleTranslation, mappedPrivateVars);
- assert(nonPrivateVar);
- moduleTranslation.mapValue(decl.getCopyMoldArg(), nonPrivateVar);
+ moduleTranslation.mapValue(decl.getCopyMoldArg(), moldVar);
// map copyRegion lhs arg
moduleTranslation.mapValue(decl.getCopyPrivateArg(), llvmVar);
@@ -1836,6 +1844,26 @@ static LogicalResult copyFirstPrivateVars(
return success();
}
+static LogicalResult copyFirstPrivateVars(
+ mlir::Operation *op, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation,
+ SmallVectorImpl<mlir::Value> &mlirPrivateVars,
+ ArrayRef<llvm::Value *> llvmPrivateVars,
+ SmallVectorImpl<omp::PrivateClauseOp> &privateDecls, bool insertBarrier,
+ llvm::DenseMap<Value, Value> *mappedPrivateVars = nullptr) {
+ llvm::SmallVector<llvm::Value *> moldVars(mlirPrivateVars.size());
+ llvm::transform(mlirPrivateVars, moldVars.begin(), [&](mlir::Value mlirVar) {
+ // map copyRegion rhs arg
+ llvm::Value *moldVar = findAssociatedValue(
+ mlirVar, builder, moduleTranslation, mappedPrivateVars);
+ assert(moldVar);
+ return moldVar;
+ });
+ return copyFirstPrivateVars(op, builder, moduleTranslation, moldVars,
+ llvmPrivateVars, privateDecls, insertBarrier,
+ mappedPrivateVars);
+}
+
static LogicalResult
cleanupPrivateVars(llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation, Location loc,
>From fcd7b8e8d649520d1e41dbd0b8d1540573439065 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Mon, 5 Jan 2026 10:10:51 +0000
Subject: [PATCH 08/16] Task duplication function generation
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 61 +++++++++-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 100 ++++++++++++++-
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 115 ++++++++++++++++--
3 files changed, 258 insertions(+), 18 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index ae69f2fcd5cdc..d720a1b457d56 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -649,6 +649,38 @@ class OpenMPIRBuilder {
using BodyGenCallbackTy =
function_ref<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
+ /// Callback type for task duplication function code generation. This is the
+ /// task duplication function passed to __kmpc_taskloop. It is expected that
+ /// this function will set up (first)private variables in the duplicated task
+ /// which have non-trivial (copy-)constructors. Insertion points are handled
+ /// the same way as for BodyGenCallbackTy.
+ ///
+ /// \ref createTaskloop lays out the task's auxiliary data structure as:
+ /// `{ lower bound, upper bound, step, data... }`. DestPtr and SrcPtr point
+ /// to this data.
+ ///
+ /// It is acceptable for the callback to be set to nullptr. In that case no
+ /// function will be generated and nullptr will be passed as the task
+ /// duplication function to __kmpc_taskloop.
+ ///
+ /// \param AllocaIP is the insertion point at which new alloca instructions
+ /// should be placed. The BasicBlock it is pointing to must
+ /// not be split.
+ /// \param CodeGenIP is the insertion point at which the body code should be
+ /// placed.
+ /// \param DestPtr This is a pointer to data inside the newly duplicated
+ /// task's auxiliary data structure (allocated after the task
+ /// descriptor.)
+ /// \param SrcPtr This is a pointer to data inside the original task's
+ /// auxiliary data structure (allocated after the task
+ /// descriptor.)
+ ///
+ /// \return The insertion point immediately after the generated code, or an
+ /// error if any occurred.
+ using TaskDupCallbackTy = function_ref<Expected<InsertPointTy>(
+ InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr,
+ Value *SrcPtr)>;
+
// This is created primarily for sections construct as llvm::function_ref
// (BodyGenCallbackTy) is not storable (as described in the comments of
// function_ref class - function_ref contains non-ownable reference
@@ -1219,6 +1251,26 @@ class OpenMPIRBuilder {
LoopAnalysis &LIA, LoopInfo &LI, llvm::Loop *L,
const Twine &NamePrefix = "");
+ /// Creates a task duplication function to be passed to kmpc_taskloop.
+ ///
+ /// The OpenMP runtime defines this function as taking the destination
+ /// kmp_task_t, source kmp_task_t, and a lastprivate flag. This function is
+ /// called on the source and destination tasks after the source task has been
+ /// duplicated to create the destination task. At this point the destination
+ /// task has been otherwise set up from the runtime's perspective, but this
+ /// function is needed to fix up any data for the duplicated task e.g. private
+ /// variables with non-trivial constructors.
+ ///
+ /// \param PrivatesTy The type of the privates structure for the task.
+ /// \param PrivatesIndex The index inside the privates structure containing
+ /// the data for the callback.
+ /// \param DupCB The callback to generate the duplication code. See
+ /// documentation for \ref TaskDupCallbackTy. This can be
+ /// nullptr.
+ Expected<Value *> createTaskDuplicationFunction(Type *PrivatesTy,
+ int32_t PrivatesIndex,
+ TaskDupCallbackTy DupCB);
+
public:
/// Modifies the canonical loop to be a workshare loop.
///
@@ -1415,11 +1467,18 @@ class OpenMPIRBuilder {
/// \param UBVal Upperbound value of loop
/// \param StepVal Step value of loop
/// \param Tied True if the task is tied, false if the task is untied.
+ /// \param DupCB The callback to generate the duplication code. See
+ /// documentation for \ref TaskDupCallbackTy. This can be nullptr.
+ /// \param TaskContextStructPtrVal If non-null, a pointer to be placed
+ /// immediately after the {lower bound, upper
+ /// bound, step} values in the task data.
LLVM_ABI InsertPointOrErrorTy createTaskloop(
const LocationDescription &Loc, InsertPointTy AllocaIP,
BodyGenCallbackTy BodyGenCB,
llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
- Value *LBVal, Value *UBVal, Value *StepVal, bool Tied = true);
+ Value *LBVal, Value *UBVal, Value *StepVal, bool Tied = true,
+ TaskDupCallbackTy DupCB = nullptr,
+ Value *TaskContextStructPtrVal = nullptr);
/// Generator for `#omp task`
///
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 42061e428189c..b89d22a712569 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -50,6 +50,7 @@
#include "llvm/IR/Value.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/VirtualFileSystem.h"
@@ -2026,11 +2027,77 @@ static Value *emitTaskDependencies(
return DepArray;
}
+/// Create the task duplication function passed to kmpc_taskloop.
+Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
+ Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
+ unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
+ if (!DupCB)
+ return Constant::getNullValue(
+ PointerType::get(Builder.getContext(), ProgramAddressSpace));
+
+ // From OpenMP Runtime p_task_dup_t:
+ // Routine optionally generated by the compiler for setting the lastprivate
+ // flag and calling needed constructors for private/firstprivate objects (used
+ // to form taskloop tasks from pattern task) Parameters: dest task, src task,
+ // lastprivate flag.
+ // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
+
+ auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
+
+ FunctionType *DupFuncTy = FunctionType::get(
+ Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
+ /*isVarArg=*/false);
+
+ Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
+ "omp_taskloop_dup", M);
+ Value *DestTaskArg = DupFunction->getArg(0);
+ Value *SrcTaskArg = DupFunction->getArg(1);
+ Value *LastprivateFlagArg = DupFunction->getArg(2);
+ DestTaskArg->setName("dest_task");
+ SrcTaskArg->setName("src_task");
+ LastprivateFlagArg->setName("lastprivate_flag");
+
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(
+ BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
+
+ auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
+ Type *TaskWithPrivatesTy =
+ StructType::get(Builder.getContext(), {Task, PrivatesTy});
+ Value *TaskPrivates = Builder.CreateGEP(
+ TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
+ Value *ContextPtr = Builder.CreateGEP(
+ PrivatesTy, TaskPrivates,
+ {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
+ return ContextPtr;
+ };
+
+ Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
+ Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
+
+ DestTaskContextPtr->setName("destPtr");
+ SrcTaskContextPtr->setName("srcPtr");
+
+ InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
+ DupFunction->getEntryBlock().begin());
+ InsertPointTy CodeGenIP = Builder.saveIP();
+ Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
+ DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
+ if (!AfterIPOrError)
+ return AfterIPOrError.takeError();
+ Builder.restoreIP(*AfterIPOrError);
+
+ Builder.CreateRetVoid();
+
+ return DupFunction;
+}
+
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
const LocationDescription &Loc, InsertPointTy AllocaIP,
BodyGenCallbackTy BodyGenCB,
llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
- Value *LBVal, Value *UBVal, Value *StepVal, bool Tied) {
+ Value *LBVal, Value *UBVal, Value *StepVal, bool Tied,
+ TaskDupCallbackTy DupCB, Value *TaskContextStructPtrVal) {
if (!updateToLocation(Loc))
return InsertPointTy();
@@ -2081,10 +2148,33 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
OI.Inputs.insert(FakeLB);
OI.Inputs.insert(FakeUB);
OI.Inputs.insert(FakeStep);
+ if (TaskContextStructPtrVal)
+ OI.Inputs.insert(TaskContextStructPtrVal);
+ assert(
+ (TaskContextStructPtrVal && DupCB) ||
+ (!TaskContextStructPtrVal && !DupCB) &&
+ "Task context struct ptr and duplication callback must be both set "
+ "or both null");
+
+ // It isn't safe to run the duplication bodygen callback inside the post
+ // outlining callback so this has to be run now before we know the real task
+ // shareds structure type.
+ unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
+ Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
+ Type *FakeSharedsTy = StructType::get(
+ Builder.getContext(),
+ {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
+ Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
+ FakeSharedsTy,
+ /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
+ if (!TaskDupFnOrErr) {
+ return TaskDupFnOrErr.takeError();
+ }
+ Value *TaskDupFn = *TaskDupFnOrErr;
OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Tied,
- TaskloopAllocaBB, CLI, Loc, ToBeDeleted, FakeLB, FakeUB,
- FakeStep](Function &OutlinedFn) mutable {
+ TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
+ FakeLB, FakeUB, FakeStep](Function &OutlinedFn) mutable {
// Replace the Stale CI by appropriate RTL function call.
assert(OutlinedFn.hasOneUse() &&
"there must be a single user for the outlined function");
@@ -2166,9 +2256,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
Value *NoGroup = Builder.getInt32(1);
Value *Sched = Builder.getInt32(0);
Value *GrainSize = Builder.getInt64(0);
-
- // TODO: Handle the case when TaskDup pointer isn't empty
- Value *TaskDup = Constant::getNullValue(Builder.getPtrTy());
+ Value *TaskDup = TaskDupFn;
Value *Args[] = {Ident, ThreadID, TaskData, IfVal, Lb, Ub,
Loadstep, NoGroup, Sched, GrainSize, TaskDup};
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index a38deca29fd0f..2ac1fd89d4dd2 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2269,6 +2269,13 @@ class TaskContextStructManager {
/// private decls.
void createGEPsToPrivateVars();
+ /// Given the address of the structure, return a GEP for each private variable
+ /// in the structure. Null values are added where private decls were skipped
+ /// so that the ordering continues to match the private decls.
+ /// Must be called after generateTaskContextStruct().
+ SmallVector<llvm::Value *>
+ createGEPsToPrivateVars(llvm::Value *altStructPtr) const;
+
llvm::Value *isAllocated();
/// De-allocate the task context structure.
@@ -2327,28 +2334,36 @@ void TaskContextStructManager::generateTaskContextStruct() {
"omp.task.context_ptr");
}
-void TaskContextStructManager::createGEPsToPrivateVars() {
- if (!structPtr) {
- assert(privateVarTypes.empty());
- return;
- }
+SmallVector<llvm::Value *> TaskContextStructManager::createGEPsToPrivateVars(
+ llvm::Value *altStructPtr) const {
+ assert(!privateVarTypes.empty());
+ SmallVector<llvm::Value *> ret;
// Create GEPs for each struct member
- llvmPrivateVarGEPs.clear();
- llvmPrivateVarGEPs.reserve(privateDecls.size());
+ ret.reserve(privateDecls.size());
llvm::Value *zero = builder.getInt32(0);
unsigned i = 0;
for (auto privDecl : privateDecls) {
if (!privDecl.readsFromMold()) {
// Handle this inside of the task so we don't pass unnessecary vars in
- llvmPrivateVarGEPs.push_back(nullptr);
+ ret.push_back(nullptr);
continue;
}
llvm::Value *iVal = builder.getInt32(i);
- llvm::Value *gep = builder.CreateGEP(structTy, structPtr, {zero, iVal});
- llvmPrivateVarGEPs.push_back(gep);
+ llvm::Value *gep = builder.CreateGEP(structTy, altStructPtr, {zero, iVal});
+ ret.push_back(gep);
i += 1;
}
+ return ret;
+}
+
+void TaskContextStructManager::createGEPsToPrivateVars() {
+ if (!structPtr) {
+ assert(privateVarTypes.empty());
+ return;
+ }
+
+ llvmPrivateVarGEPs = createGEPsToPrivateVars(structPtr);
}
llvm::Value *TaskContextStructManager::isAllocated() {
@@ -2775,6 +2790,79 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
return llvm::Error::success();
};
+ // Taskloop divides into an appropriate number of tasks by repeatedly
+ // duplicating the original task. Each time this is done, the task context
+ // structure must be duplicated too.
+ auto taskDupCB = [&](InsertPointTy AllocaIP, InsertPointTy CodegenIP,
+ llvm::Value *destPtr, llvm::Value *srcPtr)
+ -> llvm::Expected<llvm::IRBuilderBase::InsertPoint> {
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ builder.restoreIP(CodegenIP);
+
+ llvm::Type *ptrTy =
+ builder.getPtrTy(srcPtr->getType()->getPointerAddressSpace());
+ llvm::Value *src =
+ builder.CreateLoad(ptrTy, srcPtr, "omp.taskloop.context.src");
+
+ TaskContextStructManager &srcStructMgr = taskStructMgr;
+ TaskContextStructManager destStructMgr(builder, moduleTranslation,
+ privateVarsInfo.privatizers);
+ destStructMgr.generateTaskContextStruct();
+ llvm::Value *dest = destStructMgr.getStructPtr();
+ dest->setName("omp.taskloop.context.dest");
+ builder.CreateStore(dest, destPtr);
+
+ llvm::SmallVector<llvm::Value *> srcGEPs =
+ srcStructMgr.createGEPsToPrivateVars(src);
+ llvm::SmallVector<llvm::Value *> destGEPs =
+ destStructMgr.createGEPsToPrivateVars(dest);
+
+ // Inline init regions.
+ for (auto [privDecl, mold, blockArg, llvmPrivateVarAlloc] :
+ llvm::zip_equal(privateVarsInfo.privatizers, srcGEPs,
+ privateVarsInfo.blockArgs, destGEPs)) {
+ // To be handled inside task body.
+ if (!privDecl.readsFromMold())
+ continue;
+ assert(llvmPrivateVarAlloc &&
+ "reads from mold so shouldn't have been skipped");
+
+ llvm::Expected<llvm::Value *> privateVarOrErr =
+ initPrivateVar(builder, moduleTranslation, privDecl, mold, blockArg,
+ llvmPrivateVarAlloc, builder.GetInsertBlock());
+ if (!privateVarOrErr)
+ return privateVarOrErr.takeError();
+
+ setInsertPointForPossiblyEmptyBlock(builder);
+
+ // TODO: this is a bit of a hack for Fortran character boxes.
+ // Character boxes are passed by value into the init region and then the
+ // initialized character box is yielded by value. Here we need to store
+ // the yielded value into the private allocation, and load the private
+ // allocation to match the type expected by region block arguments.
+ if ((privateVarOrErr.get() != llvmPrivateVarAlloc) &&
+ !mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
+ builder.CreateStore(privateVarOrErr.get(), llvmPrivateVarAlloc);
+ // Load it so we have the value pointed to by the GEP
+ llvmPrivateVarAlloc = builder.CreateLoad(
+ privateVarOrErr.get()->getType(), llvmPrivateVarAlloc);
+ }
+ assert(llvmPrivateVarAlloc->getType() ==
+ moduleTranslation.convertType(blockArg.getType()));
+
+ // Mapping blockArg -> llvmPrivateVarAlloc is done inside the body
+ // callback so that OpenMPIRBuilder doesn't try to pass each GEP address
+ // through a stack allocated structure.
+ }
+
+ if (failed(copyFirstPrivateVars(
+ &opInst, builder, moduleTranslation, srcGEPs, destGEPs,
+ privateVarsInfo.privatizers, taskloopOp.getPrivateNeedsBarrier())))
+ return llvm::make_error<PreviouslyReportedError>();
+
+ return builder.saveIP();
+ };
+
auto loopOp = cast<omp::LoopNestOp>(taskloopOp.getWrappedLoop());
auto loopInfo = [&]() -> llvm::Expected<llvm::CanonicalLoopInfo *> {
@@ -2782,13 +2870,18 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
return loopInfo;
};
+ llvm::OpenMPIRBuilder::TaskDupCallbackTy taskDupOrNull = nullptr;
+ if (!taskStructMgr.getLLVMPrivateVarGEPs().empty())
+ taskDupOrNull = taskDupCB;
+
llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
moduleTranslation.getOpenMPBuilder()->createTaskloop(
ompLoc, allocaIP, bodyCB, loopInfo,
moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[0]),
moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[0]),
- moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]));
+ moduleTranslation.lookupValue(loopOp.getLoopSteps()[0]),
+ /*Tied=*/true, taskDupOrNull, taskStructMgr.getStructPtr());
if (failed(handleError(afterIP, opInst)))
return failure();
>From 2d597d4cbfb5d20f31b3143e8f9f6a6715c11927 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Sun, 4 Jan 2026 14:31:02 +0000
Subject: [PATCH 09/16] Fix freeing private vars and context struct
I decided not to fix the TODO about zero iteration taskloops because
this is part of a larger problem affecting similar constructs e.g.
ordinary tasks with an if clause that evaluates to false.
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 52 ++++++-------------
1 file changed, 17 insertions(+), 35 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 2ac1fd89d4dd2..85bde195d99eb 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2276,8 +2276,6 @@ class TaskContextStructManager {
SmallVector<llvm::Value *>
createGEPsToPrivateVars(llvm::Value *altStructPtr) const;
- llvm::Value *isAllocated();
-
/// De-allocate the task context structure.
void freeStructPtr();
@@ -2366,26 +2364,13 @@ void TaskContextStructManager::createGEPsToPrivateVars() {
llvmPrivateVarGEPs = createGEPsToPrivateVars(structPtr);
}
-llvm::Value *TaskContextStructManager::isAllocated() {
- if (!structPtr)
- return nullptr;
-
- return builder.CreateIsNotNull(structPtr);
-}
-
void TaskContextStructManager::freeStructPtr() {
if (!structPtr)
return;
llvm::IRBuilderBase::InsertPointGuard guard{builder};
- llvm::BasicBlock *currentBlock = builder.GetInsertBlock();
- if (currentBlock->getTerminator()) {
- // Ensure we don't put the call to free() after the terminator
- builder.SetInsertPoint(currentBlock->getTerminator());
- } else {
- // Insert the call to free() at the end of the current block
- builder.SetInsertPoint(currentBlock);
- }
+ // Ensure we don't put the call to free() after the terminator
+ builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
builder.CreateFree(structPtr);
}
@@ -2659,6 +2644,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
// Allocate and initialize private variables
builder.SetInsertPoint(initBlock->getTerminator());
+ // TODO: don't allocate if the loop has zero iterations.
taskStructMgr.generateTaskContextStruct();
taskStructMgr.createGEPsToPrivateVars();
@@ -2778,15 +2764,25 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
auto continuationBlockOrError =
convertOmpOpRegions(taskloopOp.getRegion(), "omp.taskloop.region",
builder, moduleTranslation);
- ;
+
if (failed(handleError(continuationBlockOrError, opInst)))
return llvm::make_error<PreviouslyReportedError>();
builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
- // dummy check to ensure that the task context structure is accessed inside
- // the outlined fn.
- [[maybe_unused]] llvm::Value *cond = taskStructMgr.isAllocated();
+ // This is freeing the private variables as mapped inside of the task: these
+ // will be per-task private copies possibly after task duplication. This is
+ // handled transparently by how these are passed to the structure passed
+ // into the outlined function. When the task is duplicated, that structure
+ // is duplicated too.
+ if (failed(cleanupPrivateVars(builder, moduleTranslation,
+ taskloopOp.getLoc(), llvmFirstPrivateVars,
+ privateVarsInfo.privatizers)))
+ return llvm::make_error<PreviouslyReportedError>();
+ // Similarly, the task context structure freed inside the task is the
+ // per-task copy after task duplication.
+ taskStructMgr.freeStructPtr();
+
return llvm::Error::success();
};
@@ -2887,20 +2883,6 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
return failure();
builder.restoreIP(*afterIP);
-
- // freeing the task context structure in exit block of taskloop.
- if (failed(cleanupPrivateVars(builder, moduleTranslation, taskloopOp.getLoc(),
- llvmFirstPrivateVars,
- privateVarsInfo.privatizers)))
- return failure();
-
- // Note: This free is valid because end_taskgroup waits until all generated
- // tasks are complete before returning. In the presence of Nogroup clause,
- // @__kmpc_taskgroup(..)/@__kmpc_end_taskgroup(..) is not called, have to
- // ensure that this freeStructPtr() is not called until every thread has
- // completed execution
- taskStructMgr.freeStructPtr();
-
return success();
}
>From 4586cf4d0da13e4b67106ed548dd80add98a9471 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Sun, 4 Jan 2026 14:46:26 +0000
Subject: [PATCH 10/16] Fix indexing not to re-order private vars
This is important so that the private var cleanup applies the right
cleanup region to the right variable.
---
.../Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 85bde195d99eb..b08d72da63aa1 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2649,12 +2649,11 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
taskStructMgr.createGEPsToPrivateVars();
llvmFirstPrivateVars.resize(privateVarsInfo.blockArgs.size());
- int index = 0;
- for (auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVarAlloc] :
- llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.mlirVars,
- privateVarsInfo.blockArgs,
- taskStructMgr.getLLVMPrivateVarGEPs())) {
+ for (auto [i, zip] : llvm::enumerate(llvm::zip_equal(
+ privateVarsInfo.privatizers, privateVarsInfo.mlirVars,
+ privateVarsInfo.blockArgs, taskStructMgr.getLLVMPrivateVarGEPs()))) {
+ auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVarAlloc] = zip;
// To be handled inside the taskloop.
if (!privDecl.readsFromMold())
continue;
@@ -2667,7 +2666,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
if (!privateVarOrErr)
return handleError(privateVarOrErr, *taskloopOp.getOperation());
- llvmFirstPrivateVars[index++] = privateVarOrErr.get();
+ llvmFirstPrivateVars[i] = privateVarOrErr.get();
llvm::IRBuilderBase::InsertPointGuard guard(builder);
builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator());
@@ -2728,8 +2727,6 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
return privateVarOrError.takeError();
moduleTranslation.mapValue(blockArg, privateVarOrError.get());
privateVarsInfo.llvmVars[i] = privateVarOrError.get();
- // Add private var to llvmFirstPrivateVars
- llvmFirstPrivateVars[index++] = privateVarOrError.get();
}
taskStructMgr.createGEPsToPrivateVars();
@@ -2776,7 +2773,7 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
// into the outlined function. When the task is duplicated, that structure
// is duplicated too.
if (failed(cleanupPrivateVars(builder, moduleTranslation,
- taskloopOp.getLoc(), llvmFirstPrivateVars,
+ taskloopOp.getLoc(), privateVarsInfo.llvmVars,
privateVarsInfo.privatizers)))
return llvm::make_error<PreviouslyReportedError>();
// Similarly, the task context structure freed inside the task is the
>From 5c0d99670c9694a8b0b63c156537d8c6fa46ec8f Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Sun, 4 Jan 2026 16:57:19 +0000
Subject: [PATCH 11/16] Fix loop trip count
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index b89d22a712569..4acb6595ecd3a 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2321,7 +2321,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
- Value *TripCountMinusOne = Builder.CreateSub(LoadTaskUB, LoadTaskLB);
+ Value *TripCountMinusOne =
+ Builder.CreateSDiv(Builder.CreateSub(LoadTaskUB, LoadTaskLB), FakeStep);
Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
>From 58cad17612b3a5f9a321f2b795a7e28a37789c9c Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Wed, 31 Dec 2025 13:47:00 +0000
Subject: [PATCH 12/16] [mlir][OpenMP] Implement OutlineableOpenMPOpInterface
for Taskloop
The body of taskloop is outlined and so OutlineableOpenMPOpInterface is
needed to ensure that language frontends know not to hoist allocas
outside of the body of taskloop.
The complication here is that taskloop is also a loop wrapper. Currently
some code assumes that taskloop contains only the wrapped loop, and so
there is no place to put the allocas other than in the loop body. This
is obviously not good. Unfortunately LLVM does not seem to be able to
hoist these allocas back out of the loop. The taskloop loop body will
need to contain stack saves and restores, which unfortunately hinder
some optimizations.
I think it is better to land some taskloop support in LLVM 22 than none at all.
It will take more work to find an appropriate MLIR representation for
allocas inside of outlinable loop wrappers.
---
mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 1 +
mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td | 5 +++++
2 files changed, 6 insertions(+)
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index bbfe805eefe48..1fcd7b3c23e10 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -948,6 +948,7 @@ def TaskOp
def TaskloopOp : OpenMP_Op<"taskloop", traits = [
AttrSizedOperandSegments, AutomaticAllocationScope,
DeclareOpInterfaceMethods<ComposableOpInterface>,
+ DeclareOpInterfaceMethods<OutlineableOpenMPOpInterface>,
DeclareOpInterfaceMethods<LoopWrapperInterface>, NoTerminator,
RecursiveMemoryEffects, SingleBlock
], clauses = [
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
index d471e6c0ed70b..fd500134e10f9 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td
@@ -182,6 +182,11 @@ def OutlineableOpenMPOpInterface : OpInterface<"OutlineableOpenMPOpInterface"> {
let methods = [
InterfaceMethod<"Get alloca block", "::mlir::Block*", "getAllocaBlock",
(ins), [{
+ // For taskloop: put the allocas inside of the wrapped loop. Loop wrappers
+ // are expected to contain only the wrapped loop (or another loop wrapper).
+ if (LoopWrapperInterface loopWrapper =
+ mlir::dyn_cast<LoopWrapperInterface>($_op.getOperation()))
+ return &loopWrapper.getWrappedLoop()->getRegion(0).front();
return &$_op.getRegion().front();
}]>,
];
>From ea9c289924e77788a54308d7f92b2188c161125c Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Sun, 4 Jan 2026 18:41:12 +0000
Subject: [PATCH 13/16] Update test
---
mlir/test/Target/LLVMIR/openmp-taskloop.mlir | 234 +++++++++----------
1 file changed, 116 insertions(+), 118 deletions(-)
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
index 8179784a47d90..5f31c547e7485 100644
--- a/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir
@@ -32,122 +32,120 @@ llvm.func @_QPtest() {
llvm.return
}
-// CHECK: %struct.kmp_task_info = type { ptr, ptr, i32, ptr, ptr, i64, i64, i64 }
+// CHECK-LABEL: define void @_QPtest() {
+// CHECK: %[[STRUCTARG:.*]] = alloca { i64, i64, i64, ptr }, align 8
+// CHECK: %[[VAL_0:.*]] = alloca i32, i64 1, align 4
+// CHECK: %[[VAL_1:.*]] = alloca i32, i64 1, align 4
+// CHECK: store i32 20, ptr %[[VAL_1]], align 4
+// CHECK: br label %[[VAL_2:.*]]
+// CHECK: entry: ; preds = %[[VAL_3:.*]]
+// CHECK: br label %[[VAL_4:.*]]
+// CHECK: omp.private.init: ; preds = %[[VAL_2]]
+// CHECK: %[[VAL_5:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ i32 }, ptr null, i32 1) to i64))
+// CHECK: %[[VAL_6:.*]] = getelementptr { i32 }, ptr %[[VAL_5]], i32 0, i32 0
+// CHECK: br label %[[VAL_7:.*]]
+// CHECK: omp.private.copy: ; preds = %[[VAL_4]]
+// CHECK: br label %[[VAL_8:.*]]
+// CHECK: omp.private.copy1: ; preds = %[[VAL_7]]
+// CHECK: %[[VAL_9:.*]] = load i32, ptr %[[VAL_1]], align 4
+// CHECK: store i32 %[[VAL_9]], ptr %[[VAL_6]], align 4
+// CHECK: br label %[[VAL_10:.*]]
+// CHECK: omp.taskloop.start: ; preds = %[[VAL_8]]
+// CHECK: br label %[[VAL_11:.*]]
+// CHECK: codeRepl: ; preds = %[[VAL_10]]
+// CHECK: %[[VAL_12:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 0
+// CHECK: store i64 1, ptr %[[VAL_12]], align 4
+// CHECK: %[[VAL_13:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 1
+// CHECK: store i64 5, ptr %[[VAL_13]], align 4
+// CHECK: %[[VAL_14:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 2
+// CHECK: store i64 1, ptr %[[VAL_14]], align 4
+// CHECK: %[[VAL_15:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 3
+// CHECK: store ptr %[[VAL_5]], ptr %[[VAL_15]], align 8
+// CHECK: %[[VAL_16:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK: call void @__kmpc_taskgroup(ptr @1, i32 %[[VAL_16]])
+// CHECK: %[[VAL_17:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[VAL_16]], i32 1, i64 40, i64 32, ptr @_QPtest..omp_par)
+// CHECK: %[[VAL_18:.*]] = load ptr, ptr %[[VAL_17]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_18]], ptr align 1 %[[STRUCTARG]], i64 32, i1 false)
+// CHECK: %[[VAL_19:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_18]], i32 0, i32 0
+// CHECK: %[[VAL_20:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_18]], i32 0, i32 1
+// CHECK: %[[VAL_21:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_18]], i32 0, i32 2
+// CHECK: %[[VAL_22:.*]] = load i64, ptr %[[VAL_21]], align 4
+// CHECK: call void @__kmpc_taskloop(ptr @1, i32 %[[VAL_16]], ptr %[[VAL_17]], i32 1, ptr %[[VAL_19]], ptr %[[VAL_20]], i64 %[[VAL_22]], i32 1, i32 0, i64 0, ptr @omp_taskloop_dup)
+// CHECK: call void @__kmpc_end_taskgroup(ptr @1, i32 %[[VAL_16]])
+// CHECK: br label %[[VAL_23:.*]]
+// CHECK: taskloop.exit: ; preds = %[[VAL_11]]
+// CHECK: ret void
+
+// CHECK-LABEL: define internal void @_QPtest..omp_par(
+// CHECK: taskloop.alloca:
+// CHECK: %[[VAL_24:.*]] = load ptr, ptr %[[VAL_25:.*]], align 8
+// CHECK: %[[VAL_26:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_24]], i32 0, i32 0
+// CHECK: %[[VAL_27:.*]] = load i64, ptr %[[VAL_26]], align 4
+// CHECK: %[[VAL_28:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_24]], i32 0, i32 1
+// CHECK: %[[VAL_29:.*]] = load i64, ptr %[[VAL_28]], align 4
+// CHECK: %[[VAL_30:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_24]], i32 0, i32 2
+// CHECK: %[[VAL_31:.*]] = load i64, ptr %[[VAL_30]], align 4
+// CHECK: %[[VAL_32:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_24]], i32 0, i32 3
+// CHECK: %[[VAL_33:.*]] = load ptr, ptr %[[VAL_32]], align 8, !align !1
+// CHECK: %[[VAL_34:.*]] = alloca i32, align 4
+// CHECK: br label %[[VAL_35:.*]]
+// CHECK: taskloop.body: ; preds = %[[VAL_36:.*]]
+// CHECK: %[[VAL_37:.*]] = getelementptr { i32 }, ptr %[[VAL_33]], i32 0, i32 0
+// CHECK: br label %[[VAL_38:.*]]
+// CHECK: omp.taskloop.region: ; preds = %[[VAL_35]]
+// CHECK: br label %[[VAL_39:.*]]
+// CHECK: omp_loop.preheader: ; preds = %[[VAL_38]]
+// CHECK: %[[VAL_40:.*]] = sub i64 %[[VAL_29]], %[[VAL_27]]
+// CHECK: %[[VAL_41:.*]] = sdiv i64 %[[VAL_40]], 1
+// CHECK: %[[VAL_42:.*]] = add i64 %[[VAL_41]], 1
+// CHECK: %[[VAL_43:.*]] = trunc i64 %[[VAL_42]] to i32
+// CHECK: %[[VAL_44:.*]] = trunc i64 %[[VAL_27]] to i32
+// CHECK: br label %[[VAL_45:.*]]
+// CHECK: omp_loop.header: ; preds = %[[VAL_46:.*]], %[[VAL_39]]
+// CHECK: %[[VAL_47:.*]] = phi i32 [ 0, %[[VAL_39]] ], [ %[[VAL_48:.*]], %[[VAL_46]] ]
+// CHECK: br label %[[VAL_49:.*]]
+// CHECK: omp_loop.cond: ; preds = %[[VAL_45]]
+// CHECK: %[[VAL_50:.*]] = icmp ult i32 %[[VAL_47]], %[[VAL_43]]
+// CHECK: br i1 %[[VAL_50]], label %[[VAL_51:.*]], label %[[VAL_52:.*]]
+// CHECK: omp_loop.exit: ; preds = %[[VAL_49]]
+// CHECK: br label %[[VAL_53:.*]]
+// CHECK: omp_loop.after: ; preds = %[[VAL_52]]
+// CHECK: br label %[[VAL_54:.*]]
+// CHECK: omp.region.cont: ; preds = %[[VAL_53]]
+// CHECK: tail call void @free(ptr %[[VAL_33]])
+// CHECK: br label %[[VAL_55:.*]]
+// CHECK: omp_loop.body: ; preds = %[[VAL_49]]
+// CHECK: %[[VAL_56:.*]] = mul i32 %[[VAL_47]], 1
+// CHECK: %[[VAL_57:.*]] = add i32 %[[VAL_56]], %[[VAL_44]]
+// CHECK: br label %[[VAL_58:.*]]
+// CHECK: omp.loop_nest.region: ; preds = %[[VAL_51]]
+// CHECK: store i32 %[[VAL_57]], ptr %[[VAL_34]], align 4
+// CHECK: %[[VAL_59:.*]] = load i32, ptr %[[VAL_37]], align 4
+// CHECK: %[[VAL_60:.*]] = add i32 %[[VAL_59]], 1
+// CHECK: store i32 %[[VAL_60]], ptr %[[VAL_37]], align 4
+// CHECK: br label %[[VAL_61:.*]]
+// CHECK: omp.region.cont2: ; preds = %[[VAL_58]]
+// CHECK: br label %[[VAL_46]]
+// CHECK: omp_loop.inc: ; preds = %[[VAL_61]]
+// CHECK: %[[VAL_48]] = add nuw i32 %[[VAL_47]], 1
+// CHECK: br label %[[VAL_45]]
+// CHECK: taskloop.exit.exitStub: ; preds = %[[VAL_54]]
+// CHECK: ret void
+
+// CHECK-LABEL: define internal void @omp_taskloop_dup(
+// CHECK: entry:
+// CHECK: %[[VAL_62:.*]] = getelementptr { %[[VAL_63:.*]], { i64, i64, i64, ptr } }, ptr %[[VAL_64:.*]], i32 0, i32 1
+// CHECK: %[[VAL_65:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_62]], i32 0, i32 3
+// CHECK: %[[VAL_66:.*]] = getelementptr { %[[VAL_63]], { i64, i64, i64, ptr } }, ptr %[[VAL_67:.*]], i32 0, i32 1
+// CHECK: %[[VAL_68:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_66]], i32 0, i32 3
+// CHECK: %[[VAL_69:.*]] = load ptr, ptr %[[VAL_68]], align 8
+// CHECK: %[[VAL_70:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ i32 }, ptr null, i32 1) to i64))
+// CHECK: store ptr %[[VAL_70]], ptr %[[VAL_65]], align 8
+// CHECK: %[[VAL_71:.*]] = getelementptr { i32 }, ptr %[[VAL_69]], i32 0, i32 0
+// CHECK: %[[VAL_72:.*]] = getelementptr { i32 }, ptr %[[VAL_70]], i32 0, i32 0
+// CHECK: br label %[[VAL_73:.*]]
+// CHECK: omp.private.copy: ; preds = %[[VAL_74:.*]]
+// CHECK: %[[VAL_75:.*]] = load i32, ptr %[[VAL_71]], align 4
+// CHECK: store i32 %[[VAL_75]], ptr %[[VAL_72]], align 4
+// CHECK: ret void
-// CHECK-LABEL: define void @_QPtest() {
-// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8
-// CHECK: %[[VAL1:.*]] = alloca i32, i64 1, align 4
-// CHECK: %[[VAL_X:.*]] = alloca i32, i64 1, align 4
-// CHECK: store i32 20, ptr %[[VAL_X]], align 4
-// CHECK: br label %entry
-
-// CHECK: entry:
-// CHECK: br label %omp.private.init
-
-// CHECK: omp.private.init: ; preds = %entry
-// CHECK: %[[OMP_TASK_CONTEXT_PTR:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ i32 }, ptr null, i32 1) to i64))
-// CHECK: %[[PRIV_GEP:.*]] = getelementptr { i32 }, ptr %[[OMP_TASK_CONTEXT_PTR]], i32 0, i32 0
-// CHECK: br label %omp.private.copy
-
-// CHECK: omp.private.copy:
-// CHECK: br label %omp.private.copy1
-
-// CHECK: omp.private.copy1:
-// CHECK: %[[LOAD_X:.*]] = load i32, ptr %[[VAL_X]], align 4
-// CHECK: store i32 %[[LOAD_X]], ptr %[[PRIV_GEP]], align 4
-// CHECK: br label %omp.taskloop.start
-
-// CHECK: omp.taskloop.start:
-// CHECK: br label %codeRepl
-
-// CHECK: codeRepl:
-// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0
-// CHECK: store ptr %[[OMP_TASK_CONTEXT_PTR]], ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8
-// CHECK: %[[GTID:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
-// CHECK: call void @__kmpc_taskgroup(ptr @1, i32 %[[GTID]])
-// CHECK: %[[TASK_PTR:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[GTID]], i32 1, i64 64, i64 8, ptr @_QPtest..omp_par)
-// CHECK: %[[LB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 5
-// CHECK: store i64 1, ptr %[[LB_GEP]], align 4
-// CHECK: %[[UB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 6
-// CHECK: store i64 5, ptr %[[UB_GEP]], align 4
-// CHECK: %[[STEP_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 7
-// CHECK: store i64 1, ptr %[[STEP_GEP]], align 4
-// CHECK: %[[LOAD_STEP:.*]] = load i64, ptr %[[STEP_GEP]], align 4
-// CHECK: %10 = load ptr, ptr %[[TASK_PTR]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %10, ptr align 1 %[[STRUCTARG]], i64 8, i1 false)
-// CHECK: call void @__kmpc_taskloop(ptr @1, i32 %[[GTID]], ptr %[[TASK_PTR]], i32 1, ptr %[[LB_GEP]], ptr %[[UB_GEP]], i64 %[[LOAD_STEP]], i32 1, i32 0, i64 0, ptr null)
-// CHECK: call void @__kmpc_end_taskgroup(ptr @1, i32 %[[GTID]])
-// CHECK: br label %taskloop.exit
-
-// CHECK: taskloop.exit:
-// CHECK: tail call void @free(ptr %[[OMP_TASK_CONTEXT_PTR]])
-// CHECK: ret void
-// CHECK: }
-
-// CHECK-LABEL: define internal void @_QPtest..omp_par
-// CHECK-SAME: i32 %[[GLOBAL_TID:.*]], ptr %[[TASK_PTR1:.*]]) {
-// CHECK: taskloop.alloca:
-// CHECK: %[[LOAD_TASK_PTR:.*]] = load ptr, ptr %[[TASK_PTR1]], align 8
-// CHECK: %[[GEP_LB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 5
-// CHECK: %[[LOAD_LB64:.*]] = load i64, ptr %[[GEP_LB]], align 4
-// CHECK: %[[LB:.*]] = trunc i64 %[[LOAD_LB64]] to i32
-// CHECK: %[[GEP_UB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 6
-// CHECK: %[[LOAD_UB64:.*]] = load i64, ptr %[[GEP_UB]], align 4
-// CHECK: %[[UB:.*]] = trunc i64 %[[LOAD_UB64]] to i32
-// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[LOAD_TASK_PTR]], i32 0, i32 0
-// CHECK: %[[LOADGEP_OMP_TASK_CONTEXT_PTR:.*]] = load ptr, ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8, !align !1
-// CHECK: %[[OMP_PRIVATE_ALLOC:.*]] = alloca i32, align 4
-// CHECK: br label %taskloop.body
-
-// CHECK: taskloop.body:
-// CHECK: %[[LOAD_X:.*]] = getelementptr { i32 }, ptr %[[LOADGEP_OMP_TASK_CONTEXT_PTR]], i32 0, i32 0
-// CHECK: br label %omp.taskloop.region
-
-// CHECK: omp.taskloop.region:
-// CHECK: br label %omp_loop.preheader
-
-// CHECK: omp_loop.preheader:
-// CHECK: %[[VAL2:.*]] = sub i32 %[[UB]], %[[LB]]
-// CHECK: %[[TRIP_CNT:.*]] = add i32 %[[VAL2]], 1
-// CHECK: br label %omp_loop.header
-
-// CHECK: omp_loop.header:
-// CHECK: %[[OMP_LOOP_IV:.*]] = phi i32 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ]
-// CHECK: br label %omp_loop.cond
-
-// CHECK: omp_loop.cond:
-// CHECK: %[[OMP_LOOP_CMP:.*]] = icmp ult i32 %[[OMP_LOOP_IV]], %[[TRIP_CNT]]
-// CHECK: br i1 %[[OMP_LOOP_CMP]], label %omp_loop.body, label %omp_loop.exit
-
-// CHECK: omp_loop.exit:
-// CHECK: br label %omp_loop.after
-
-// CHECK: omp_loop.after:
-// CHECK: br label %omp.region.cont
-
-// CHECK: omp.region.cont:
-// CHECK: %[[IS_ALLOCATED:.*]] = icmp ne ptr %[[LOADGEP_OMP_TASK_CONTEXT_PTR]], null
-// CHECK: br label %taskloop.exit.exitStub
-
-// CHECK: omp_loop.body:
-// CHECK: %[[VAL3:.*]] = mul i32 %[[OMP_LOOP_IV]], 1
-// CHECK: %[[VAL5:.*]] = add i32 %[[VAL3]], %[[LB]]
-// CHECK: br label %omp.loop_nest.region
-
-// CHECK: omp.loop_nest.region:
-// CHECK: store i32 %[[VAL5]], ptr %[[OMP_PRIVATE_ALLOC]], align 4
-// CHECK: %[[VAL6:.*]] = load i32, ptr %[[LOAD_X]], align 4
-// CHECK: %[[RES:.*]] = add i32 %[[VAL6]], 1
-// CHECK: store i32 %[[RES]], ptr %[[LOAD_X]], align 4
-// CHECK: br label %omp.region.cont2
-
-// CHECK: omp.region.cont2:
-// CHECK: br label %omp_loop.inc
-
-// CHECK: omp_loop.inc:
-// CHECK: %omp_loop.next = add nuw i32 %[[OMP_LOOP_IV]], 1
-// CHECK: br label %omp_loop.header
-
-// CHECK: taskloop.exit.exitStub:
-// CHECK: ret void
-// CHECK: }
\ No newline at end of file
>From fd12bde790d70c38ffa9e728960a25832ab50a4a Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Mon, 5 Jan 2026 09:55:40 +0000
Subject: [PATCH 14/16] [NFC] Share body generation callback between task and
taskloop
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 273 +++++++-----------
1 file changed, 101 insertions(+), 172 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index b08d72da63aa1..500bcf39ddf14 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2374,11 +2374,105 @@ void TaskContextStructManager::freeStructPtr() {
builder.CreateFree(structPtr);
}
+using TaskLikeBodyGenCallbackTy =
+ std::function<llvm::Error(llvm::OpenMPIRBuilder::InsertPointTy allocaIP,
+ llvm::OpenMPIRBuilder::InsertPointTy codegenIP)>;
+
+/// Build the body generation callback shared by task-like constructs (task and
+/// taskloop).
+static TaskLikeBodyGenCallbackTy buildTaskLikeBodyGenCallback(
+ Operation *opInst, Region &region, StringRef regionName,
+ llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation,
+ PrivateVarsInfo &privateVarsInfo, TaskContextStructManager &taskStructMgr) {
+ using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+ return [&, regionName](InsertPointTy allocaIP,
+ InsertPointTy codegenIP) -> llvm::Error {
+ // Save the alloca insertion point on ModuleTranslation stack for use in
+ // nested regions.
+ LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
+ moduleTranslation, allocaIP);
+
+ // translate the body of the task:
+ builder.restoreIP(codegenIP);
+
+ llvm::BasicBlock *privInitBlock = nullptr;
+ privateVarsInfo.llvmVars.resize(privateVarsInfo.blockArgs.size());
+ for (auto [i, zip] : llvm::enumerate(llvm::zip_equal(
+ privateVarsInfo.blockArgs, privateVarsInfo.privatizers,
+ privateVarsInfo.mlirVars))) {
+ auto [blockArg, privDecl, mlirPrivVar] = zip;
+ // This is handled before the task executes
+ if (privDecl.readsFromMold())
+ continue;
+
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ llvm::Type *llvmAllocType =
+ moduleTranslation.convertType(privDecl.getType());
+ builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
+ llvm::Value *llvmPrivateVar = builder.CreateAlloca(
+ llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
+
+ llvm::Expected<llvm::Value *> privateVarOrError =
+ initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar,
+ blockArg, llvmPrivateVar, privInitBlock);
+ if (!privateVarOrError)
+ return privateVarOrError.takeError();
+ moduleTranslation.mapValue(blockArg, privateVarOrError.get());
+ privateVarsInfo.llvmVars[i] = privateVarOrError.get();
+ }
+
+ taskStructMgr.createGEPsToPrivateVars();
+ for (auto [i, llvmPrivVar] :
+ llvm::enumerate(taskStructMgr.getLLVMPrivateVarGEPs())) {
+ if (!llvmPrivVar) {
+ assert(privateVarsInfo.llvmVars[i] &&
+ "This is added in the loop above");
+ continue;
+ }
+ privateVarsInfo.llvmVars[i] = llvmPrivVar;
+ }
+
+ // Find and map the addresses of each variable within the task context
+ // structure
+ for (auto [blockArg, llvmPrivateVar, privateDecl] :
+ llvm::zip_equal(privateVarsInfo.blockArgs, privateVarsInfo.llvmVars,
+ privateVarsInfo.privatizers)) {
+ // This was handled above.
+ if (!privateDecl.readsFromMold())
+ continue;
+ // Fix broken pass-by-value case for Fortran character boxes
+ if (!mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
+ llvmPrivateVar = builder.CreateLoad(
+ moduleTranslation.convertType(blockArg.getType()), llvmPrivateVar);
+ }
+ assert(llvmPrivateVar->getType() ==
+ moduleTranslation.convertType(blockArg.getType()));
+ moduleTranslation.mapValue(blockArg, llvmPrivateVar);
+ }
+
+ auto continuationBlockOrError =
+ convertOmpOpRegions(region, regionName, builder, moduleTranslation);
+ if (failed(handleError(continuationBlockOrError, *opInst)))
+ return llvm::make_error<PreviouslyReportedError>();
+
+ builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
+
+ if (failed(cleanupPrivateVars(builder, moduleTranslation, opInst->getLoc(),
+ privateVarsInfo.llvmVars,
+ privateVarsInfo.privatizers)))
+ return llvm::make_error<PreviouslyReportedError>();
+
+ // Free heap allocated task context structure at the end of the task.
+ taskStructMgr.freeStructPtr();
+
+ return llvm::Error::success();
+ };
+}
+
/// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
- using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
if (failed(checkImplementationStatus(*taskOp)))
return failure();
@@ -2491,88 +2585,9 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
// Set up for call to createTask()
builder.SetInsertPoint(taskStartBlock);
- auto bodyCB = [&](InsertPointTy allocaIP,
- InsertPointTy codegenIP) -> llvm::Error {
- // Save the alloca insertion point on ModuleTranslation stack for use in
- // nested regions.
- LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
- moduleTranslation, allocaIP);
-
- // translate the body of the task:
- builder.restoreIP(codegenIP);
-
- llvm::BasicBlock *privInitBlock = nullptr;
- privateVarsInfo.llvmVars.resize(privateVarsInfo.blockArgs.size());
- for (auto [i, zip] : llvm::enumerate(llvm::zip_equal(
- privateVarsInfo.blockArgs, privateVarsInfo.privatizers,
- privateVarsInfo.mlirVars))) {
- auto [blockArg, privDecl, mlirPrivVar] = zip;
- // This is handled before the task executes
- if (privDecl.readsFromMold())
- continue;
-
- llvm::IRBuilderBase::InsertPointGuard guard(builder);
- llvm::Type *llvmAllocType =
- moduleTranslation.convertType(privDecl.getType());
- builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
- llvm::Value *llvmPrivateVar = builder.CreateAlloca(
- llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
-
- llvm::Expected<llvm::Value *> privateVarOrError =
- initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar,
- blockArg, llvmPrivateVar, privInitBlock);
- if (!privateVarOrError)
- return privateVarOrError.takeError();
- moduleTranslation.mapValue(blockArg, privateVarOrError.get());
- privateVarsInfo.llvmVars[i] = privateVarOrError.get();
- }
-
- taskStructMgr.createGEPsToPrivateVars();
- for (auto [i, llvmPrivVar] :
- llvm::enumerate(taskStructMgr.getLLVMPrivateVarGEPs())) {
- if (!llvmPrivVar) {
- assert(privateVarsInfo.llvmVars[i] &&
- "This is added in the loop above");
- continue;
- }
- privateVarsInfo.llvmVars[i] = llvmPrivVar;
- }
-
- // Find and map the addresses of each variable within the task context
- // structure
- for (auto [blockArg, llvmPrivateVar, privateDecl] :
- llvm::zip_equal(privateVarsInfo.blockArgs, privateVarsInfo.llvmVars,
- privateVarsInfo.privatizers)) {
- // This was handled above.
- if (!privateDecl.readsFromMold())
- continue;
- // Fix broken pass-by-value case for Fortran character boxes
- if (!mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
- llvmPrivateVar = builder.CreateLoad(
- moduleTranslation.convertType(blockArg.getType()), llvmPrivateVar);
- }
- assert(llvmPrivateVar->getType() ==
- moduleTranslation.convertType(blockArg.getType()));
- moduleTranslation.mapValue(blockArg, llvmPrivateVar);
- }
-
- auto continuationBlockOrError = convertOmpOpRegions(
- taskOp.getRegion(), "omp.task.region", builder, moduleTranslation);
- if (failed(handleError(continuationBlockOrError, *taskOp)))
- return llvm::make_error<PreviouslyReportedError>();
-
- builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
-
- if (failed(cleanupPrivateVars(builder, moduleTranslation, taskOp.getLoc(),
- privateVarsInfo.llvmVars,
- privateVarsInfo.privatizers)))
- return llvm::make_error<PreviouslyReportedError>();
-
- // Free heap allocated task context structure at the end of the task.
- taskStructMgr.freeStructPtr();
-
- return llvm::Error::success();
- };
+ auto bodyCB = buildTaskLikeBodyGenCallback(
+ taskOp, taskOp.getRegion(), "omp.task.region", builder, moduleTranslation,
+ privateVarsInfo, taskStructMgr);
llvm::OpenMPIRBuilder &ompBuilder = *moduleTranslation.getOpenMPBuilder();
SmallVector<llvm::BranchInst *> cancelTerminators;
@@ -2693,95 +2708,9 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
// Set up insertion point for call to createTaskloop()
builder.SetInsertPoint(taskloopStartBlock);
- auto bodyCB = [&](InsertPointTy allocaIP,
- InsertPointTy codegenIP) -> llvm::Error {
- // Save the alloca insertion point on ModuleTranslation stack for use in
- // nested regions.
- LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
- moduleTranslation, allocaIP);
-
- // translate the body of the taskloop:
- builder.restoreIP(codegenIP);
-
- llvm::BasicBlock *privInitBlock = nullptr;
- privateVarsInfo.llvmVars.resize(privateVarsInfo.blockArgs.size());
- for (auto [i, zip] : llvm::enumerate(llvm::zip_equal(
- privateVarsInfo.blockArgs, privateVarsInfo.privatizers,
- privateVarsInfo.mlirVars))) {
- auto [blockArg, privDecl, mlirPrivVar] = zip;
- // This is handled before the task executes
- if (privDecl.readsFromMold())
- continue;
-
- llvm::IRBuilderBase::InsertPointGuard guard(builder);
- llvm::Type *llvmAllocType =
- moduleTranslation.convertType(privDecl.getType());
- builder.SetInsertPoint(allocaIP.getBlock()->getTerminator());
- llvm::Value *llvmPrivateVar = builder.CreateAlloca(
- llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc");
-
- llvm::Expected<llvm::Value *> privateVarOrError =
- initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar,
- blockArg, llvmPrivateVar, privInitBlock);
- if (!privateVarOrError)
- return privateVarOrError.takeError();
- moduleTranslation.mapValue(blockArg, privateVarOrError.get());
- privateVarsInfo.llvmVars[i] = privateVarOrError.get();
- }
-
- taskStructMgr.createGEPsToPrivateVars();
- for (auto [i, llvmPrivVar] :
- llvm::enumerate(taskStructMgr.getLLVMPrivateVarGEPs())) {
- if (!llvmPrivVar) {
- assert(privateVarsInfo.llvmVars[i] &&
- "This is added in the loop above");
- continue;
- }
- privateVarsInfo.llvmVars[i] = llvmPrivVar;
- }
-
- // Find and map the addresses of each variable within the taskloop context
- // structure
- for (auto [blockArg, llvmPrivateVar, privateDecl] :
- llvm::zip_equal(privateVarsInfo.blockArgs, privateVarsInfo.llvmVars,
- privateVarsInfo.privatizers)) {
- // This was handled above.
- if (!privateDecl.readsFromMold())
- continue;
- // Fix broken pass-by-value case for Fortran character boxes
- if (!mlir::isa<LLVM::LLVMPointerType>(blockArg.getType())) {
- llvmPrivateVar = builder.CreateLoad(
- moduleTranslation.convertType(blockArg.getType()), llvmPrivateVar);
- }
- assert(llvmPrivateVar->getType() ==
- moduleTranslation.convertType(blockArg.getType()));
- moduleTranslation.mapValue(blockArg, llvmPrivateVar);
- }
-
- auto continuationBlockOrError =
- convertOmpOpRegions(taskloopOp.getRegion(), "omp.taskloop.region",
- builder, moduleTranslation);
-
- if (failed(handleError(continuationBlockOrError, opInst)))
- return llvm::make_error<PreviouslyReportedError>();
-
- builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator());
-
- // This is freeing the private variables as mapped inside of the task: these
- // will be per-task private copies possibly after task duplication. This is
- // handled transparently by how these are passed to the structure passed
- // into the outlined function. When the task is duplicated, that structure
- // is duplicated too.
- if (failed(cleanupPrivateVars(builder, moduleTranslation,
- taskloopOp.getLoc(), privateVarsInfo.llvmVars,
- privateVarsInfo.privatizers)))
- return llvm::make_error<PreviouslyReportedError>();
- // Similarly, the task context structure freed inside the task is the
- // per-task copy after task duplication.
- taskStructMgr.freeStructPtr();
-
- return llvm::Error::success();
- };
+ auto bodyCB = buildTaskLikeBodyGenCallback(
+ &opInst, taskloopOp.getRegion(), "omp.taskloop.region", builder,
+ moduleTranslation, privateVarsInfo, taskStructMgr);
// Taskloop divides into an appropriate number of tasks by repeatedly
// duplicating the original task. Each time this is done, the task context
>From 912d1d236fa369f06e26ea2972ed91eeaa266dda Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Tue, 6 Jan 2026 13:05:12 +0000
Subject: [PATCH 15/16] Remove incorrect assertion
This was my mistake whilst tidying something up. The context structure can
be empty at the point when this helper runs, and the helper is still
required to add enough dummy entries to the GEPs array so that its length
matches the number of privatization decls. I've added a regression test
covering this case.
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 1 -
.../openmp-taskloop-no-context-struct.mlir | 128 ++++++++++++++++++
2 files changed, 128 insertions(+), 1 deletion(-)
create mode 100644 mlir/test/Target/LLVMIR/openmp-taskloop-no-context-struct.mlir
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 500bcf39ddf14..e089c56ca4990 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2334,7 +2334,6 @@ void TaskContextStructManager::generateTaskContextStruct() {
SmallVector<llvm::Value *> TaskContextStructManager::createGEPsToPrivateVars(
llvm::Value *altStructPtr) const {
- assert(!privateVarTypes.empty());
SmallVector<llvm::Value *> ret;
// Create GEPs for each struct member
diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop-no-context-struct.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop-no-context-struct.mlir
new file mode 100644
index 0000000000000..43b50e7a3206c
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-taskloop-no-context-struct.mlir
@@ -0,0 +1,128 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Regression check for a taskloop that has private variables, none of which
+// go into the context struct.
+
+omp.private {type = private} @_QFtestEi_private_i32 : i32
+omp.private {type = private} @_QFtestEt2_private_i32 : i32
+omp.private {type = private} @_QFtestEt1_private_i32 : i32
+llvm.func @_QPtest() {
+ %0 = llvm.mlir.constant(1 : i32) : i32
+ %1 = llvm.mlir.constant(20 : i32) : i32
+ %2 = llvm.mlir.constant(1 : i64) : i64
+ %3 = llvm.alloca %2 x i32 {bindc_name = "t2"} : (i64) -> !llvm.ptr
+ %4 = llvm.alloca %2 x i32 {bindc_name = "t1"} : (i64) -> !llvm.ptr
+ %5 = llvm.alloca %2 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+ omp.taskloop private(@_QFtestEt1_private_i32 %4 -> %arg0, @_QFtestEt2_private_i32 %3 -> %arg1, @_QFtestEi_private_i32 %5 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+ omp.loop_nest (%arg3) : i32 = (%0) to (%1) inclusive step (%0) {
+ llvm.store %arg3, %arg2 : i32, !llvm.ptr
+ omp.yield
+ }
+ }
+ llvm.return
+}
+// CHECK-LABEL: define void @_QPtest() {
+// CHECK: %[[STRUCTARG:.*]] = alloca { i64, i64, i64, ptr }, align 8
+// CHECK: %[[VAL_0:.*]] = alloca i32, i64 1, align 4
+// CHECK: %[[VAL_1:.*]] = alloca i32, i64 1, align 4
+// CHECK: %[[VAL_2:.*]] = alloca i32, i64 1, align 4
+// CHECK: br label %[[VAL_3:.*]]
+// CHECK: entry: ; preds = %[[VAL_4:.*]]
+// CHECK: br label %[[VAL_5:.*]]
+// CHECK: omp.private.init: ; preds = %[[VAL_3]]
+// CHECK: %[[VAL_6:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({}, ptr null, i32 1) to i64))
+// CHECK: br label %[[VAL_7:.*]]
+// CHECK: omp.private.copy: ; preds = %[[VAL_5]]
+// CHECK: br label %[[VAL_8:.*]]
+// CHECK: omp.taskloop.start: ; preds = %[[VAL_7]]
+// CHECK: br label %[[VAL_9:.*]]
+// CHECK: codeRepl: ; preds = %[[VAL_8]]
+// CHECK: %[[VAL_10:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 0
+// CHECK: store i64 1, ptr %[[VAL_10]], align 4
+// CHECK: %[[VAL_11:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 1
+// CHECK: store i64 20, ptr %[[VAL_11]], align 4
+// CHECK: %[[VAL_12:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 2
+// CHECK: store i64 1, ptr %[[VAL_12]], align 4
+// CHECK: %[[VAL_13:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[STRUCTARG]], i32 0, i32 3
+// CHECK: store ptr %[[VAL_6]], ptr %[[VAL_13]], align 8
+// CHECK: %[[VAL_14:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK: call void @__kmpc_taskgroup(ptr @1, i32 %[[VAL_14]])
+// CHECK: %[[VAL_15:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[VAL_14]], i32 1, i64 40, i64 32, ptr @_QPtest..omp_par)
+// CHECK: %[[VAL_16:.*]] = load ptr, ptr %[[VAL_15]], align 8
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %[[VAL_16]], ptr align 1 %[[STRUCTARG]], i64 32, i1 false)
+// CHECK: %[[VAL_17:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_16]], i32 0, i32 0
+// CHECK: %[[VAL_18:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_16]], i32 0, i32 1
+// CHECK: %[[VAL_19:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_16]], i32 0, i32 2
+// CHECK: %[[VAL_20:.*]] = load i64, ptr %[[VAL_19]], align 4
+// CHECK: call void @__kmpc_taskloop(ptr @1, i32 %[[VAL_14]], ptr %[[VAL_15]], i32 1, ptr %[[VAL_17]], ptr %[[VAL_18]], i64 %[[VAL_20]], i32 1, i32 0, i64 0, ptr @omp_taskloop_dup)
+// CHECK: call void @__kmpc_end_taskgroup(ptr @1, i32 %[[VAL_14]])
+// CHECK: br label %[[VAL_21:.*]]
+// CHECK: taskloop.exit: ; preds = %[[VAL_9]]
+// CHECK: ret void
+
+// CHECK-LABEL: define internal void @_QPtest..omp_par
+// CHECK: taskloop.alloca:
+// CHECK: %[[VAL_22:.*]] = load ptr, ptr %[[VAL_23:.*]], align 8
+// CHECK: %[[VAL_24:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_22]], i32 0, i32 0
+// CHECK: %[[VAL_25:.*]] = load i64, ptr %[[VAL_24]], align 4
+// CHECK: %[[VAL_26:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_22]], i32 0, i32 1
+// CHECK: %[[VAL_27:.*]] = load i64, ptr %[[VAL_26]], align 4
+// CHECK: %[[VAL_28:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_22]], i32 0, i32 2
+// CHECK: %[[VAL_29:.*]] = load i64, ptr %[[VAL_28]], align 4
+// CHECK: %[[VAL_30:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_22]], i32 0, i32 3
+// CHECK: %[[VAL_31:.*]] = load ptr, ptr %[[VAL_30]], align 8, !align !1
+// CHECK: %[[VAL_32:.*]] = alloca i32, align 4
+// CHECK: %[[VAL_33:.*]] = alloca i32, align 4
+// CHECK: %[[VAL_34:.*]] = alloca i32, align 4
+// CHECK: br label %[[VAL_35:.*]]
+// CHECK: taskloop.body: ; preds = %[[VAL_36:.*]]
+// CHECK: br label %[[VAL_37:.*]]
+// CHECK: omp.taskloop.region: ; preds = %[[VAL_35]]
+// CHECK: br label %[[VAL_38:.*]]
+// CHECK: omp_loop.preheader: ; preds = %[[VAL_37]]
+// CHECK: %[[VAL_39:.*]] = sub i64 %[[VAL_27]], %[[VAL_25]]
+// CHECK: %[[VAL_40:.*]] = sdiv i64 %[[VAL_39]], 1
+// CHECK: %[[VAL_41:.*]] = add i64 %[[VAL_40]], 1
+// CHECK: %[[VAL_42:.*]] = trunc i64 %[[VAL_41]] to i32
+// CHECK: %[[VAL_43:.*]] = trunc i64 %[[VAL_25]] to i32
+// CHECK: br label %[[VAL_44:.*]]
+// CHECK: omp_loop.header: ; preds = %[[VAL_45:.*]], %[[VAL_38]]
+// CHECK: %[[VAL_46:.*]] = phi i32 [ 0, %[[VAL_38]] ], [ %[[VAL_47:.*]], %[[VAL_45]] ]
+// CHECK: br label %[[VAL_48:.*]]
+// CHECK: omp_loop.cond: ; preds = %[[VAL_44]]
+// CHECK: %[[VAL_49:.*]] = icmp ult i32 %[[VAL_46]], %[[VAL_42]]
+// CHECK: br i1 %[[VAL_49]], label %[[VAL_50:.*]], label %[[VAL_51:.*]]
+// CHECK: omp_loop.exit: ; preds = %[[VAL_48]]
+// CHECK: br label %[[VAL_52:.*]]
+// CHECK: omp_loop.after: ; preds = %[[VAL_51]]
+// CHECK: br label %[[VAL_53:.*]]
+// CHECK: omp.region.cont: ; preds = %[[VAL_52]]
+// CHECK: tail call void @free(ptr %[[VAL_31]])
+// CHECK: br label %[[VAL_54:.*]]
+// CHECK: omp_loop.body: ; preds = %[[VAL_48]]
+// CHECK: %[[VAL_55:.*]] = mul i32 %[[VAL_46]], 1
+// CHECK: %[[VAL_56:.*]] = add i32 %[[VAL_55]], %[[VAL_43]]
+// CHECK: br label %[[VAL_57:.*]]
+// CHECK: omp.loop_nest.region: ; preds = %[[VAL_50]]
+// CHECK: store i32 %[[VAL_56]], ptr %[[VAL_34]], align 4
+// CHECK: br label %[[VAL_58:.*]]
+// CHECK: omp.region.cont3: ; preds = %[[VAL_57]]
+// CHECK: br label %[[VAL_45]]
+// CHECK: omp_loop.inc: ; preds = %[[VAL_58]]
+// CHECK: %[[VAL_47]] = add nuw i32 %[[VAL_46]], 1
+// CHECK: br label %[[VAL_44]]
+// CHECK: taskloop.exit.exitStub: ; preds = %[[VAL_53]]
+// CHECK: ret void
+
+// CHECK-LABEL: define internal void @omp_taskloop_dup(
+// CHECK: entry:
+// CHECK: %[[VAL_59:.*]] = getelementptr { %[[VAL_60:.*]], { i64, i64, i64, ptr } }, ptr %[[VAL_61:.*]], i32 0, i32 1
+// CHECK: %[[VAL_62:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_59]], i32 0, i32 3
+// CHECK: %[[VAL_63:.*]] = getelementptr { %[[VAL_60]], { i64, i64, i64, ptr } }, ptr %[[VAL_64:.*]], i32 0, i32 1
+// CHECK: %[[VAL_65:.*]] = getelementptr { i64, i64, i64, ptr }, ptr %[[VAL_63]], i32 0, i32 3
+// CHECK: %[[VAL_66:.*]] = load ptr, ptr %[[VAL_65]], align 8
+// TODO: don't generate allocation for empty task context struct (for later patch)
+// CHECK: %[[VAL_67:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({}, ptr null, i32 1) to i64))
+// CHECK: store ptr %[[VAL_67]], ptr %[[VAL_62]], align 8
+// CHECK: ret void
+
>From b5f5567c11dc0b1e583bc3f701b044003678347a Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Wed, 7 Jan 2026 16:11:45 +0000
Subject: [PATCH 16/16] Fix argument names
---
.../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index e089c56ca4990..8e0668f9d540b 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2714,11 +2714,11 @@ convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder,
// Taskloop divides into an appropriate number of tasks by repeatedly
// duplicating the original task. Each time this is done, the task context
// structure must be duplicated too.
- auto taskDupCB = [&](InsertPointTy AllocaIP, InsertPointTy CodegenIP,
+ auto taskDupCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP,
llvm::Value *destPtr, llvm::Value *srcPtr)
-> llvm::Expected<llvm::IRBuilderBase::InsertPoint> {
llvm::IRBuilderBase::InsertPointGuard guard(builder);
- builder.restoreIP(CodegenIP);
+ builder.restoreIP(codegenIP);
llvm::Type *ptrTy =
builder.getPtrTy(srcPtr->getType()->getPointerAddressSpace());
More information about the Mlir-commits
mailing list