[Mlir-commits] [clang] [llvm] [mlir] [MLIR][OpenMP] Add codegen for teams reductions (PR #133310)
Jan Leyonberg
llvmlistbot at llvm.org
Thu Mar 27 13:42:37 PDT 2025
https://github.com/jsjodin updated https://github.com/llvm/llvm-project/pull/133310
>From 50fefbb31f4de7352c241c48fe5382785daaef21 Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Sun, 23 Mar 2025 09:56:51 -0400
Subject: [PATCH 1/9] Initial modifications to support reductions in flang.
---
clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp | 3 +-
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 3 --
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 30 +++++++++++++++----
3 files changed, 26 insertions(+), 10 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index feb2448297542..d30bef9e7f0ba 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1659,7 +1659,6 @@ void CGOpenMPRuntimeGPU::emitReduction(
return;
bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
- bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
ASTContext &C = CGM.getContext();
@@ -1756,7 +1755,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
cantFail(OMPBuilder.createReductionsGPU(
OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
- DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
+ llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
CGF.getTarget().getGridValue(),
C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc));
CGF.Builder.restoreIP(AfterIP);
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 28909cef4748d..9b67d0c050e46 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1905,8 +1905,6 @@ class OpenMPIRBuilder {
/// nowait.
/// \param IsTeamsReduction Optional flag set if it is a teams
/// reduction.
- /// \param HasDistribute Optional flag set if it is a
- /// distribute reduction.
/// \param GridValue Optional GPU grid value.
/// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be
/// used for teams reduction.
@@ -1915,7 +1913,6 @@ class OpenMPIRBuilder {
const LocationDescription &Loc, InsertPointTy AllocaIP,
InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
bool IsNoWait = false, bool IsTeamsReduction = false,
- bool HasDistribute = false,
ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
Value *SrcLocInfo = nullptr);
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2e5ce5308eea5..0001626086d7c 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3495,9 +3495,9 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
const LocationDescription &Loc, InsertPointTy AllocaIP,
InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
- bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
- ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
- unsigned ReductionBufNum, Value *SrcLocInfo) {
+ bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
+ std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
+ Value *SrcLocInfo) {
if (!updateToLocation(Loc))
return InsertPointTy();
Builder.restoreIP(CodeGenIP);
@@ -3514,6 +3514,16 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
if (ReductionInfos.size() == 0)
return Builder.saveIP();
+ BasicBlock *ContinuationBlock = nullptr;
+ if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
+    // Same continuation-block splitting as done in createReductions.
+ BasicBlock *InsertBlock = Loc.IP.getBlock();
+ ContinuationBlock =
+ InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
+ InsertBlock->getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
+ }
+
Function *CurFunc = Builder.GetInsertBlock()->getParent();
AttributeList FuncAttrs;
AttrBuilder AttrBldr(Ctx);
@@ -3669,11 +3679,21 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
ReductionFunc;
});
} else {
- assert(false && "Unhandled ReductionGenCBKind");
+ Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
+ Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
+ Value *Reduced;
+ InsertPointOrErrorTy AfterIP =
+ RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
+ if (!AfterIP)
+ return AfterIP.takeError();
+ Builder.CreateStore(Reduced, LHS, false);
}
}
emitBlock(ExitBB, CurFunc);
-
+ if (ContinuationBlock) {
+ Builder.CreateBr(ContinuationBlock);
+ Builder.SetInsertPoint(ContinuationBlock);
+ }
Config.setEmitLLVMUsed();
return Builder.saveIP();
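
For the new non-Clang branch above, here is a minimal sketch (hypothetical helper addCombiner, not part of the patch) of the element-wise combiner an MLIR frontend would supply through ReductionInfo::ReductionGen; createReductionsGPU loads the two partial values, invokes this callback, and stores the combined result back through the LHS pointer:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical integer "+" combiner with the (InsertPointTy, Value *, Value *,
// Value *&) -> InsertPointOrErrorTy shape expected by RI.ReductionGen.
static OpenMPIRBuilder::InsertPointOrErrorTy
addCombiner(OpenMPIRBuilder &OMPBuilder, OpenMPIRBuilder::InsertPointTy IP,
            Value *LHS, Value *RHS, Value *&Result) {
  IRBuilder<> &Builder = OMPBuilder.Builder;
  Builder.restoreIP(IP);                           // emit at the requested point
  Result = Builder.CreateAdd(LHS, RHS, "red.add"); // combine the two partials
  return Builder.saveIP();
}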
>From fc3a9d0311abcc9e0fd0174c89214613c8afc14e Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Sun, 23 Mar 2025 10:11:39 -0400
Subject: [PATCH 2/9] Prepare for reduction support
---
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 5 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 143 ++++++++++++------
2 files changed, 98 insertions(+), 50 deletions(-)
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 9b67d0c050e46..a3a266e3f0a98 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1984,7 +1984,8 @@ class OpenMPIRBuilder {
InsertPointTy AllocaIP,
ArrayRef<ReductionInfo> ReductionInfos,
ArrayRef<bool> IsByRef,
- bool IsNoWait = false);
+ bool IsNoWait = false,
+ bool IsTeamsReduction = false);
///}
@@ -2268,6 +2269,8 @@ class OpenMPIRBuilder {
int32_t MinTeams = 1;
SmallVector<int32_t, 3> MaxThreads = {-1};
int32_t MinThreads = 1;
+ int32_t ReductionDataSize = 0;
+ int32_t ReductionBufferLength = 0;
};
/// Container to pass LLVM IR runtime values or constants related to the
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 0001626086d7c..6eb3ae7b60eb9 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3708,27 +3708,95 @@ static Function *getFreshReductionFunc(Module &M) {
".omp.reduction.func", &M);
}
-OpenMPIRBuilder::InsertPointOrErrorTy
-OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
- InsertPointTy AllocaIP,
- ArrayRef<ReductionInfo> ReductionInfos,
- ArrayRef<bool> IsByRef, bool IsNoWait) {
- assert(ReductionInfos.size() == IsByRef.size());
- for (const ReductionInfo &RI : ReductionInfos) {
- (void)RI;
- assert(RI.Variable && "expected non-null variable");
- assert(RI.PrivateVariable && "expected non-null private variable");
- assert(RI.ReductionGen && "expected non-null reduction generator callback");
- assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
- "expected variables and their private equivalents to have the same "
- "type");
- assert(RI.Variable->getType()->isPointerTy() &&
- "expected variables to be pointers");
+static Error populateReductionFunction(
+ Function *ReductionFunc,
+ ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+ IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
+ Module *Module = ReductionFunc->getParent();
+ BasicBlock *ReductionFuncBlock =
+ BasicBlock::Create(Module->getContext(), "", ReductionFunc);
+ Builder.SetInsertPoint(ReductionFuncBlock);
+ Value *LHSArrayPtr = nullptr;
+ Value *RHSArrayPtr = nullptr;
+ if (IsGPU) {
+ // Need to alloca memory here and deal with the pointers before getting
+ // LHS/RHS pointers out
+ //
+ Argument *Arg0 = ReductionFunc->getArg(0);
+ Argument *Arg1 = ReductionFunc->getArg(1);
+ Type *Arg0Type = Arg0->getType();
+ Type *Arg1Type = Arg1->getType();
+
+ Value *LHSAlloca =
+ Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+ Value *RHSAlloca =
+ Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+ Value *LHSAddrCast =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
+ Value *RHSAddrCast =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
+ Builder.CreateStore(Arg0, LHSAddrCast);
+ Builder.CreateStore(Arg1, RHSAddrCast);
+ LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
+ RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
+ } else {
+ LHSArrayPtr = ReductionFunc->getArg(0);
+ RHSArrayPtr = ReductionFunc->getArg(1);
+ }
+
+ unsigned NumReductions = ReductionInfos.size();
+ Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
+
+ for (auto En : enumerate(ReductionInfos)) {
+ const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+ Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedArrayTy, LHSArrayPtr, 0, En.index());
+ Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+ Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ LHSI8Ptr, RI.Variable->getType());
+ Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+ Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+ RedArrayTy, RHSArrayPtr, 0, En.index());
+ Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
+ Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+ RHSI8Ptr, RI.PrivateVariable->getType());
+ Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+ Value *Reduced;
+ OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
+ RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
+ if (!AfterIP)
+ return AfterIP.takeError();
+
+ Builder.restoreIP(*AfterIP);
+ // TODO: Consider flagging an error.
+ if (!Builder.GetInsertBlock())
+ return Error::success();
+
+ // store is inside of the reduction region when using by-ref
+ if (!IsByRef[En.index()])
+ Builder.CreateStore(Reduced, LHSPtr);
}
+ Builder.CreateRetVoid();
+ return Error::success();
+}
+
+OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
+ bool IsNoWait, bool IsTeamsReduction) {
+ assert(ReductionInfos.size() == IsByRef.size());
+ if (Config.isGPU())
+ return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
+ IsNoWait, IsTeamsReduction);
+
+ checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
if (!updateToLocation(Loc))
return InsertPointTy();
+ if (ReductionInfos.size() == 0)
+ return Builder.saveIP();
+
BasicBlock *InsertBlock = Loc.IP.getBlock();
BasicBlock *ContinuationBlock =
InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
@@ -3852,38 +3920,13 @@ OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
// Populate the outlined reduction function using the elementwise reduction
// function. Partial values are extracted from the type-erased array of
// pointers to private variables.
- BasicBlock *ReductionFuncBlock =
- BasicBlock::Create(Module->getContext(), "", ReductionFunc);
- Builder.SetInsertPoint(ReductionFuncBlock);
- Value *LHSArrayPtr = ReductionFunc->getArg(0);
- Value *RHSArrayPtr = ReductionFunc->getArg(1);
+ Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
+ IsByRef, false);
+ if (Err)
+ return Err;
- for (auto En : enumerate(ReductionInfos)) {
- const ReductionInfo &RI = En.value();
- Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedArrayTy, LHSArrayPtr, 0, En.index());
- Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
- Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
- Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
- Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
- RedArrayTy, RHSArrayPtr, 0, En.index());
- Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
- Value *RHSPtr =
- Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
- Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
- Value *Reduced;
- InsertPointOrErrorTy AfterIP =
- RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
- if (!AfterIP)
- return AfterIP.takeError();
- Builder.restoreIP(*AfterIP);
- if (!Builder.GetInsertBlock())
- return InsertPointTy();
- // store is inside of the reduction region when using by-ref
- if (!IsByRef[En.index()])
- Builder.CreateStore(Reduced, LHSPtr);
- }
- Builder.CreateRetVoid();
+ if (!Builder.GetInsertBlock())
+ return InsertPointTy();
Builder.SetInsertPoint(ContinuationBlock);
return Builder.saveIP();
@@ -6259,8 +6302,10 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
- Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
- Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
+ Constant *ReductionDataSize =
+ ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
+ Constant *ReductionBufferLength =
+ ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
Function *Fn = getOrCreateRuntimeFunctionPtr(
omp::RuntimeFunction::OMPRTL___kmpc_target_init);
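
A minimal call-site sketch (hypothetical wrapper emitTeamsReduction, not part of the patch) of the extended host entry point: the only change for callers is the trailing IsTeamsReduction flag, and when Config.isGPU() is set the call is forwarded to createReductionsGPU as shown above:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
using namespace llvm;

// Hypothetical wrapper that requests a teams reduction via the new flag.
static OpenMPIRBuilder::InsertPointOrErrorTy
emitTeamsReduction(OpenMPIRBuilder &OMPBuilder,
                   const OpenMPIRBuilder::LocationDescription &Loc,
                   OpenMPIRBuilder::InsertPointTy AllocaIP,
                   ArrayRef<OpenMPIRBuilder::ReductionInfo> Infos,
                   ArrayRef<bool> IsByRef) {
  return OMPBuilder.createReductions(Loc, AllocaIP, Infos, IsByRef,
                                     /*IsNoWait=*/false,
                                     /*IsTeamsReduction=*/true);
}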
>From 8497219d2a269cb5d346214c0e48180f84174605 Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Sun, 23 Mar 2025 10:53:02 -0400
Subject: [PATCH 3/9] Enable reductions
---
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 208 ++++++++++++++++--
1 file changed, 185 insertions(+), 23 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index d41489921bd13..155ea3f920617 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -265,7 +265,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
.Case([&](omp::TeamsOp op) {
checkAllocate(op, result);
checkPrivate(op, result);
- checkReduction(op, result);
})
.Case([&](omp::TaskOp op) {
checkAllocate(op, result);
@@ -1018,19 +1017,37 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
// variable allocated in the inlined region)
llvm::Value *var = builder.CreateAlloca(
moduleTranslation.convertType(reductionDecls[i].getType()));
- deferredStores.emplace_back(phis[0], var);
-
- privateReductionVariables[i] = var;
- moduleTranslation.mapValue(reductionArgs[i], phis[0]);
- reductionVariableMap.try_emplace(loop.getReductionVars()[i], phis[0]);
+ // var->setName("private_redvar");
+
+ llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext());
+ llvm::Value *castVar =
+ builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+ // TODO: I (Sergio) just guessed casting phis[0] like it's done for var is
+ // what's supposed to happen with this code coming from a merge from main,
+ // but I don't actually know. Someone more familiar with it needs to check
+ // this.
+ llvm::Value *castPhi =
+ builder.CreatePointerBitCastOrAddrSpaceCast(phis[0], ptrTy);
+
+ deferredStores.emplace_back(castPhi, castVar);
+
+ privateReductionVariables[i] = castVar;
+ moduleTranslation.mapValue(reductionArgs[i], castPhi);
+ reductionVariableMap.try_emplace(loop.getReductionVars()[i], castPhi);
} else {
assert(allocRegion.empty() &&
"allocaction is implicit for by-val reduction");
llvm::Value *var = builder.CreateAlloca(
moduleTranslation.convertType(reductionDecls[i].getType()));
- moduleTranslation.mapValue(reductionArgs[i], var);
- privateReductionVariables[i] = var;
- reductionVariableMap.try_emplace(loop.getReductionVars()[i], var);
+ // var->setName("private_redvar");
+
+ llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext());
+ llvm::Value *castVar =
+ builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+
+ moduleTranslation.mapValue(reductionArgs[i], castVar);
+ privateReductionVariables[i] = castVar;
+ reductionVariableMap.try_emplace(loop.getReductionVars()[i], castVar);
}
}
@@ -1250,18 +1267,20 @@ static LogicalResult createReductionsAndCleanup(
LLVM::ModuleTranslation &moduleTranslation,
llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
- ArrayRef<llvm::Value *> privateReductionVariables, ArrayRef<bool> isByRef) {
+ ArrayRef<llvm::Value *> privateReductionVariables, ArrayRef<bool> isByRef,
+ bool isNowait = false, bool isTeamsReduction = false) {
// Process the reductions if required.
if (op.getNumReductionVars() == 0)
return success();
+ SmallVector<OwningReductionGen> owningReductionGens;
+ SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
+ SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
+
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
// Create the reduction generators. We need to own them here because
// ReductionInfo only accepts references to the generators.
- SmallVector<OwningReductionGen> owningReductionGens;
- SmallVector<OwningAtomicReductionGen> owningAtomicReductionGens;
- SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> reductionInfos;
collectReductionInfo(op, builder, moduleTranslation, reductionDecls,
owningReductionGens, owningAtomicReductionGens,
privateReductionVariables, reductionInfos);
@@ -1273,7 +1292,7 @@ static LogicalResult createReductionsAndCleanup(
builder.SetInsertPoint(tempTerminator);
llvm::OpenMPIRBuilder::InsertPointOrErrorTy contInsertPoint =
ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos,
- isByRef, op.getNowait());
+ isByRef, isNowait, isTeamsReduction);
if (failed(handleError(contInsertPoint, *op)))
return failure();
@@ -1666,9 +1685,9 @@ convertOmpSections(Operation &opInst, llvm::IRBuilderBase &builder,
builder.restoreIP(*afterIP);
// Process the reductions if required.
- return createReductionsAndCleanup(sectionsOp, builder, moduleTranslation,
- allocaIP, reductionDecls,
- privateReductionVariables, isByRef);
+ return createReductionsAndCleanup(
+ sectionsOp, builder, moduleTranslation, allocaIP, reductionDecls,
+ privateReductionVariables, isByRef, sectionsOp.getNowait());
}
/// Converts an OpenMP single construct into LLVM IR using OpenMPIRBuilder.
@@ -1714,6 +1733,43 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder,
return success();
}
+static bool teamsReductionContainedInDistribute(omp::TeamsOp teamsOp) {
+ auto iface =
+ llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(teamsOp.getOperation());
+  // Check that all uses of the reduction block args have the same distribute
+  // op parent.
+ llvm::SmallVector<mlir::Operation *> debugUses;
+ Operation *distOp = nullptr;
+ for (auto ra : iface.getReductionBlockArgs())
+ for (auto &use : ra.getUses()) {
+ auto *useOp = use.getOwner();
+ // Ignore debug uses.
+ if (mlir::isa<LLVM::DbgDeclareOp>(useOp) ||
+ mlir::isa<LLVM::DbgValueOp>(useOp)) {
+ debugUses.push_back(useOp);
+ continue;
+ }
+
+ auto currentDistOp = useOp->getParentOfType<omp::DistributeOp>();
+ // Use is not inside a distribute op - return false
+ if (!currentDistOp)
+ return false;
+ // Multiple distribute operations - return false
+ Operation *currentOp = currentDistOp.getOperation();
+ if (distOp && (distOp != currentOp))
+ return false;
+
+ distOp = currentOp;
+ }
+
+  // If we are going to use the distribute reduction, remove any debug uses of
+  // the reduction parameters in teamsOp. Otherwise they will be left without
+  // any mapped value in moduleTranslation and will eventually error out.
+ for (auto use : debugUses)
+ use->erase();
+ return true;
+}
+
// Convert an OpenMP Teams construct to LLVM IR using OpenMPIRBuilder
static LogicalResult
convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
@@ -1722,6 +1778,34 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
if (failed(checkImplementationStatus(*op)))
return failure();
+ DenseMap<Value, llvm::Value *> reductionVariableMap;
+ unsigned numReductionVars = op.getNumReductionVars();
+ SmallVector<omp::DeclareReductionOp> reductionDecls;
+ SmallVector<llvm::Value *> privateReductionVariables(numReductionVars);
+ llvm::ArrayRef<bool> isByRef;
+ llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
+ findAllocaInsertPoint(builder, moduleTranslation);
+
+ // Only do teams reduction if there is no distribute op that captures the
+ // reduction instead.
+ bool doTeamsReduction = !teamsReductionContainedInDistribute(op);
+ if (doTeamsReduction) {
+ isByRef = getIsByRef(op.getReductionByref());
+
+ assert(isByRef.size() == op.getNumReductionVars());
+
+ MutableArrayRef<BlockArgument> reductionArgs =
+ llvm::cast<omp::BlockArgOpenMPOpInterface>(*op).getReductionBlockArgs();
+
+ collectReductionDecls(op, reductionDecls);
+
+ if (failed(allocAndInitializeReductionVars(
+ op, reductionArgs, builder, moduleTranslation, allocaIP,
+ reductionDecls, privateReductionVariables, reductionVariableMap,
+ isByRef)))
+ return failure();
+ }
+
auto bodyCB = [&](InsertPointTy allocaIP, InsertPointTy codegenIP) {
LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
moduleTranslation, allocaIP);
@@ -1756,6 +1840,13 @@ convertOmpTeams(omp::TeamsOp op, llvm::IRBuilderBase &builder,
return failure();
builder.restoreIP(*afterIP);
+ if (doTeamsReduction) {
+ // Process the reductions if required.
+ return createReductionsAndCleanup(
+ op, builder, moduleTranslation, allocaIP, reductionDecls,
+ privateReductionVariables, isByRef,
+ /*isNoWait*/ false, /*isTeamsReduction*/ true);
+ }
return success();
}
@@ -2273,9 +2364,10 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder,
return failure();
// Process the reductions if required.
- if (failed(createReductionsAndCleanup(wsloopOp, builder, moduleTranslation,
- allocaIP, reductionDecls,
- privateReductionVariables, isByRef)))
+ if (failed(createReductionsAndCleanup(
+ wsloopOp, builder, moduleTranslation, allocaIP, reductionDecls,
+ privateReductionVariables, isByRef, wsloopOp.getNowait(),
+ /*isTeamsReduction=*/false)))
return failure();
return cleanupPrivateVars(builder, moduleTranslation, wsloopOp.getLoc(),
@@ -2378,7 +2470,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder::InsertPointOrErrorTy contInsertPoint =
ompBuilder->createReductions(builder.saveIP(), allocaIP,
- reductionInfos, isByRef, false);
+ reductionInfos, isByRef, false, false);
if (!contInsertPoint)
return contInsertPoint.takeError();
@@ -4161,6 +4253,37 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
if (failed(checkImplementationStatus(opInst)))
return failure();
+ /// Process teams op reduction in distribute if the reduction is contained in
+ /// the distribute op.
+ omp::TeamsOp teamsOp = opInst.getParentOfType<omp::TeamsOp>();
+ bool doDistributeReduction =
+ teamsOp ? teamsReductionContainedInDistribute(teamsOp) : false;
+
+ DenseMap<Value, llvm::Value *> reductionVariableMap;
+ unsigned numReductionVars = teamsOp ? teamsOp.getNumReductionVars() : 0;
+ SmallVector<omp::DeclareReductionOp> reductionDecls;
+ SmallVector<llvm::Value *> privateReductionVariables(numReductionVars);
+ llvm::ArrayRef<bool> isByRef;
+
+ if (doDistributeReduction) {
+ isByRef = getIsByRef(teamsOp.getReductionByref());
+ assert(isByRef.size() == teamsOp.getNumReductionVars());
+
+ collectReductionDecls(teamsOp, reductionDecls);
+ llvm::OpenMPIRBuilder::InsertPointTy allocaIP =
+ findAllocaInsertPoint(builder, moduleTranslation);
+
+ MutableArrayRef<BlockArgument> reductionArgs =
+ llvm::cast<omp::BlockArgOpenMPOpInterface>(*teamsOp)
+ .getReductionBlockArgs();
+
+ if (failed(allocAndInitializeReductionVars(
+ teamsOp, reductionArgs, builder, moduleTranslation, allocaIP,
+ reductionDecls, privateReductionVariables, reductionVariableMap,
+ isByRef)))
+ return failure();
+ }
+
using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
auto bodyGenCB = [&](InsertPointTy allocaIP,
InsertPointTy codeGenIP) -> llvm::Error {
@@ -4244,6 +4367,14 @@ convertOmpDistribute(Operation &opInst, llvm::IRBuilderBase &builder,
return failure();
builder.restoreIP(*afterIP);
+
+ if (doDistributeReduction) {
+ // Process the reductions if required.
+ return createReductionsAndCleanup(
+ teamsOp, builder, moduleTranslation, allocaIP, reductionDecls,
+ privateReductionVariables, isByRef,
+ /*isNoWait*/ false, /*isTeamsReduction*/ true);
+ }
return success();
}
@@ -4554,6 +4685,25 @@ static std::optional<int64_t> extractConstInteger(Value value) {
return std::nullopt;
}
+static uint64_t getTypeByteSize(mlir::Type type, const DataLayout &dl) {
+ uint64_t sizeInBits = dl.getTypeSizeInBits(type);
+ uint64_t sizeInBytes = sizeInBits / 8;
+ return sizeInBytes;
+}
+
+template <typename OpTy>
+static uint64_t getReductionDataSize(OpTy &op) {
+ if (op.getNumReductionVars() > 0) {
+    assert(op.getNumReductionVars() == 1 &&
+           "Only 1 reduction variable currently supported");
+ mlir::Type reductionVarTy = op.getReductionVars()[0].getType();
+ Operation *opp = op.getOperation();
+ DataLayout dl = DataLayout(opp->getParentOfType<ModuleOp>());
+ return getTypeByteSize(reductionVarTy, dl);
+ }
+ return 0;
+}
+
/// Populate default `MinTeams`, `MaxTeams` and `MaxThreads` to their default
/// values as stated by the corresponding clauses, if constant.
///
@@ -4563,7 +4713,7 @@ static std::optional<int64_t> extractConstInteger(Value value) {
static void
initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp,
llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &attrs,
- bool isTargetDevice) {
+ bool isTargetDevice, bool isGPU) {
// TODO: Handle constant 'if' clauses.
Value numThreads, numTeamsLower, numTeamsUpper, threadLimit;
@@ -4645,12 +4795,23 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp,
(maxThreadsVal >= 0 && maxThreadsVal < combinedMaxThreadsVal))
combinedMaxThreadsVal = maxThreadsVal;
+  // Calculate the reduction data size, limited to a single reduction variable
+  // for now.
+ int32_t reductionDataSize = 0;
+ if (isGPU && capturedOp) {
+ if (auto teamsOp = castOrGetParentOfType<omp::TeamsOp>(capturedOp))
+ reductionDataSize = getReductionDataSize(teamsOp);
+ }
+
// Update kernel bounds structure for the `OpenMPIRBuilder` to use.
attrs.ExecFlags = targetOp.getKernelExecFlags(capturedOp);
attrs.MinTeams = minTeamsVal;
attrs.MaxTeams.front() = maxTeamsVal;
attrs.MinThreads = 1;
attrs.MaxThreads.front() = combinedMaxThreadsVal;
+ attrs.ReductionDataSize = reductionDataSize;
+ if (attrs.ReductionDataSize != 0)
+ attrs.ReductionBufferLength = 1024;
}
/// Gather LLVM runtime values for all clauses evaluated in the host that are
@@ -4731,6 +4892,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
bool isTargetDevice = ompBuilder->Config.isTargetDevice();
+ bool isGPU = ompBuilder->Config.isGPU();
auto parentFn = opInst.getParentOfType<LLVM::LLVMFuncOp>();
auto argIface = cast<omp::BlockArgOpenMPOpInterface>(opInst);
@@ -4933,7 +5095,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs defaultAttrs;
Operation *targetCapturedOp = targetOp.getInnermostCapturedOmpOp();
initTargetDefaultAttrs(targetOp, targetCapturedOp, defaultAttrs,
- isTargetDevice);
+ isTargetDevice, isGPU);
// Collect host-evaluated values needed to properly launch the kernel from the
// host.
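
To make the new kernel attributes concrete, a small worked example (hypothetical helper exampleReductionDataSize, not part of the patch) of the byte-size query behind getReductionDataSize: a single i32 reduction variable reports 32 bits, so attrs.ReductionDataSize becomes 4 and initTargetDefaultAttrs then selects a 1024-entry reduction buffer:

#include <cstdint>
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Interfaces/DataLayoutInterfaces.h"

// Hypothetical illustration of the DataLayout query used above.
static uint64_t exampleReductionDataSize(mlir::ModuleOp module) {
  mlir::DataLayout dl(module);
  mlir::Type i32Ty = mlir::IntegerType::get(module.getContext(), 32);
  uint64_t sizeInBits = dl.getTypeSizeInBits(i32Ty); // 32 bits for i32
  return sizeInBits / 8;                             // -> 4 bytes
}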
>From 53216480793381000352b17bb675770374b5ed80 Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Thu, 27 Mar 2025 09:48:50 -0400
Subject: [PATCH 4/9] Remove todo test.
---
mlir/test/Target/LLVMIR/openmp-todo.mlir | 28 ------------------------
1 file changed, 28 deletions(-)
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index af31f8bab73ac..7eafe396082e4 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -536,34 +536,6 @@ llvm.func @teams_private(%x : !llvm.ptr) {
// -----
-omp.declare_reduction @add_f32 : f32
-init {
-^bb0(%arg: f32):
- %0 = llvm.mlir.constant(0.0 : f32) : f32
- omp.yield (%0 : f32)
-}
-combiner {
-^bb1(%arg0: f32, %arg1: f32):
- %1 = llvm.fadd %arg0, %arg1 : f32
- omp.yield (%1 : f32)
-}
-atomic {
-^bb2(%arg2: !llvm.ptr, %arg3: !llvm.ptr):
- %2 = llvm.load %arg3 : !llvm.ptr -> f32
- llvm.atomicrmw fadd %arg2, %2 monotonic : !llvm.ptr, f32
- omp.yield
-}
-llvm.func @teams_reduction(%x : !llvm.ptr) {
- // expected-error at below {{not yet implemented: Unhandled clause reduction in omp.teams operation}}
- // expected-error at below {{LLVM Translation failed for operation: omp.teams}}
- omp.teams reduction(@add_f32 %x -> %prv : !llvm.ptr) {
- omp.terminator
- }
- llvm.return
-}
-
-// -----
-
llvm.func @wsloop_allocate(%lb : i32, %ub : i32, %step : i32, %x : !llvm.ptr) {
// expected-error at below {{not yet implemented: Unhandled clause allocate in omp.wsloop operation}}
// expected-error at below {{LLVM Translation failed for operation: omp.wsloop}}
>From d2aadda98a4eb29aefeb4390fedea1f9610e5608 Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Thu, 27 Mar 2025 15:02:01 -0400
Subject: [PATCH 5/9] Add fix for tripcount.
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 6eb3ae7b60eb9..578ac1326010a 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4497,10 +4497,23 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
static void createTargetLoopWorkshareCall(
OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
- Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
- Type *TripCountTy = TripCount->getType();
+ Type *ParallelTaskPtr, Value *TripCountOrig, Function &LoopBodyFn) {
Module &M = OMPBuilder->M;
IRBuilder<> &Builder = OMPBuilder->Builder;
+ Value *TripCount = TripCountOrig;
+ // FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
+ // not be the right way to fix it, but this works for now.
+ if (OMPBuilder->Config.isGPU()) {
+ Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
+ LLVMContext &Ctx = M.getContext();
+ Type *IVTy = TripCountOrig->getType();
+ Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32
+ ? Type::getInt32Ty(Ctx)
+ : Type::getInt64Ty(Ctx);
+ Constant *One = ConstantInt::get(InternalIVTy, 1);
+ TripCount = Builder.CreateSub(TripCountOrig, One, "modified_trip_count");
+ }
+ Type *TripCountTy = TripCount->getType();
FunctionCallee RTLFn =
getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
SmallVector<Value *, 8> RealArgs;
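
Concretely, the effect shows up in the MLIR test updates in patch 7 below: the 10-iteration wsloop tests now expect __kmpc_for_static_loop_4u to be called with i32 9 instead of i32 10, and the collapsed-loop test with i32 9999 instead of i32 10000, matching what the deviceRTL entry points expect when driven the way clang drives them (per the comment above).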
>From fc7043ab2b7d292d8de15dd22ba7a298c26682dc Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Thu, 27 Mar 2025 15:24:19 -0400
Subject: [PATCH 6/9] Add offload runtime tests.
---
.../basic-target-parallel-reduction.f90 | 27 +++++++++++++++++++
.../basic-target-teams-parallel-reduction.f90 | 27 +++++++++++++++++++
2 files changed, 54 insertions(+)
create mode 100644 offload/test/offloading/fortran/basic-target-parallel-reduction.f90
create mode 100644 offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90
diff --git a/offload/test/offloading/fortran/basic-target-parallel-reduction.f90 b/offload/test/offloading/fortran/basic-target-parallel-reduction.f90
new file mode 100644
index 0000000000000..ce2bb714c8d0f
--- /dev/null
+++ b/offload/test/offloading/fortran/basic-target-parallel-reduction.f90
@@ -0,0 +1,27 @@
+! Basic offloading test with a target region
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+program main
+ use omp_lib
+ integer :: error = 0
+ integer :: i
+ integer :: sum = 0
+
+ !$omp target parallel do reduction(+:sum)
+ do i = 1, 100
+ sum = sum + i
+ end do
+ !$omp end target parallel do
+
+ if (sum /= 5050) then
+ error = 1
+ endif
+
+ print *,"number of errors: ", error
+
+end program main
+
+! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK: number of errors: 0
diff --git a/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90 b/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90
new file mode 100644
index 0000000000000..950887bf05f66
--- /dev/null
+++ b/offload/test/offloading/fortran/basic-target-teams-parallel-reduction.f90
@@ -0,0 +1,27 @@
+! Basic offloading test with a target region
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+program main
+ use omp_lib
+ integer :: error = 0
+ integer :: i
+ integer :: sum = 0
+
+ !$omp target teams distribute parallel do reduction(+:sum)
+ do i = 1, 1000
+ sum = sum + i
+ end do
+ !$omp end target teams distribute parallel do
+
+ if (sum /= 500500) then
+ error = 1
+ endif
+
+ print *,"number of errors: ", error
+
+end program main
+
+! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK: number of errors: 0
>From 37c6b7bcb96c66c1e5e918c537a1c79b814b0ce4 Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Thu, 27 Mar 2025 16:20:57 -0400
Subject: [PATCH 7/9] Fix tests, add new teams reduction test.
---
.../LLVMIR/omptarget-parallel-wsloop.mlir | 2 +-
.../LLVMIR/omptarget-wsloop-collapsed.mlir | 2 +-
mlir/test/Target/LLVMIR/omptarget-wsloop.mlir | 4 +-
.../Target/LLVMIR/openmp-teams-reduction.mlir | 71 +++++++++++++++++++
4 files changed, 75 insertions(+), 4 deletions(-)
create mode 100644 mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir
diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
index 649210795ff5c..0e6e1c3b83bf1 100644
--- a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
@@ -36,7 +36,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK-SAME: ptr %[[ARG_PTR:.*]])
// CHECK-SAME: #[[ATTRS1:[0-9]+]]
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB]] to ptr),
-// CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOO_BODY_FUNC_ARG:.*]], i32 10,
+// CHECK-SAME: ptr @[[LOOP_BODY_FUNC:.*]], ptr %[[LOO_BODY_FUNC_ARG:.*]], i32 9,
// CHECK-SAME: i32 %[[THREAD_NUM:.*]], i32 0)
// CHECK: define internal void @[[LOOP_BODY_FUNC]](i32 %[[CNT:.*]], ptr %[[LOOP_BODY_ARG_PTR:.*]]) #[[ATTRS2:[0-9]+]] {
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
index b7aecec308ef3..2213a5b7a4709 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
@@ -24,7 +24,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: define void @[[FUNC_COLLAPSED_WSLOOP:.*]](ptr %[[ARG0:.*]])
// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr),
-// CHECK-SAME: ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 10000,
+// CHECK-SAME: ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 9999,
// CHECK-SAME: i32 %[[NUM_THREADS:.*]], i32 0)
// CHECK: define internal void @[[COLLAPSED_WSLOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index e2a8d88bd181a..845647593108f 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -37,7 +37,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: %[[GEP:.*]] = getelementptr { ptr }, ptr addrspace(5) %[[STRUCTARG]], i32 0, i32 0
// CHECK: store ptr %[[ARG0]], ptr addrspace(5) %[[GEP]], align 8
// CHECK: %[[NUM_THREADS:.*]] = call i32 @omp_get_num_threads()
-// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 10, i32 %[[NUM_THREADS]], i32 0)
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr), ptr @[[LOOP_BODY_FN:.*]], ptr %[[STRUCTARG_ASCAST]], i32 9, i32 %[[NUM_THREADS]], i32 0)
// CHECK: define internal void @[[LOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
// CHECK: %[[GEP2:.*]] = getelementptr { ptr }, ptr %[[LOOP_BODY_ARG]], i32 0, i32 0
@@ -46,6 +46,6 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
// CHECK: store i32 %[[VAL0:.*]], ptr %[[GEP3]], align 4
// CHECK: define void @[[FUNC_EMPTY_WSLOOP:.*]]()
-// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 10, i32 %[[NUM_THREADS:.*]], i32 0)
+// CHECK: call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr), ptr @[[LOOP_EMPTY_BODY_FN:.*]], ptr null, i32 9, i32 %[[NUM_THREADS:.*]], i32 0)
// CHECK: define internal void @[[LOOP_EMPTY_BODY_FN]](i32 %[[LOOP_CNT:.*]])
diff --git a/mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir
new file mode 100644
index 0000000000000..854723050b035
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-teams-reduction.mlir
@@ -0,0 +1,71 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+// Only check the overall shape of the code and the presence of relevant
+// runtime calls. Actual IR checking is done at the OpenMPIRBuilder level.
+
+omp.private {type = private} @_QFsimple_teams_reductionEindex__private_i32 : i32
+omp.declare_reduction @add_reduction_i32 : i32 init {
+^bb0(%arg0: i32):
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ omp.yield(%0 : i32)
+} combiner {
+^bb0(%arg0: i32, %arg1: i32):
+ %0 = llvm.add %arg0, %arg1 : i32
+ omp.yield(%0 : i32)
+}
+llvm.func @simple_teams_reduction_() attributes {fir.internal_name = "_QPsimple_teams_reduction", frame_pointer = #llvm.framePointerKind<all>, target_cpu = "x86-64"} {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x i32 {bindc_name = "sum"} : (i64) -> !llvm.ptr
+ %2 = llvm.mlir.constant(1 : i64) : i64
+ %3 = llvm.alloca %2 x i32 {bindc_name = "index_"} : (i64) -> !llvm.ptr
+ %4 = llvm.mlir.constant(10000 : i32) : i32
+ %5 = llvm.mlir.constant(1 : i32) : i32
+ %6 = llvm.mlir.constant(0 : i32) : i32
+ %7 = llvm.mlir.constant(1 : i64) : i64
+ %8 = llvm.mlir.constant(1 : i64) : i64
+ llvm.store %6, %1 : i32, !llvm.ptr
+ omp.teams reduction(@add_reduction_i32 %1 -> %arg0 : !llvm.ptr) {
+ omp.distribute private(@_QFsimple_teams_reductionEindex__private_i32 %3 -> %arg1 : !llvm.ptr) {
+ omp.loop_nest (%arg2) : i32 = (%5) to (%4) inclusive step (%5) {
+ llvm.store %arg2, %arg1 : i32, !llvm.ptr
+ %9 = llvm.load %arg0 : !llvm.ptr -> i32
+ %10 = llvm.load %arg1 : !llvm.ptr -> i32
+ %11 = llvm.add %9, %10 : i32
+ llvm.store %11, %arg0 : i32, !llvm.ptr
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+// Call to outlined function
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %[[PRIVATE:.+]] = alloca i32
+// CHECK: store i32 0, ptr %[[PRIVATE]]
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Atomic version not generated
+// CHECK: unreachable
+
+// Non atomic version
+// CHECK: call void @__kmpc_end_reduce
+
+// Finalize
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+// CHECK: call void @__kmpc_barrier
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: add i32
>From c198b85dfce8c06a6514c092cfd2a74cb8292dcb Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Thu, 27 Mar 2025 16:23:38 -0400
Subject: [PATCH 8/9] Fix comment.
---
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 578ac1326010a..b5e55dbccf464 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4501,8 +4501,9 @@ static void createTargetLoopWorkshareCall(
Module &M = OMPBuilder->M;
IRBuilder<> &Builder = OMPBuilder->Builder;
Value *TripCount = TripCountOrig;
- // FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
- // not be the right way to fix it, but this works for now.
+  // The trip count is 1 larger than it should be for the GPU; this is because
+  // of how the deviceRTL functions work with clang. TODO: make the trip count
+  // consistent between both paths so we don't have to subtract one here.
if (OMPBuilder->Config.isGPU()) {
Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
LLVMContext &Ctx = M.getContext();
>From 1be881a0ffb8542796df8b9a17de38ca519d5067 Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Thu, 27 Mar 2025 16:41:47 -0400
Subject: [PATCH 9/9] Fix comments
---
.../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 6 ------
1 file changed, 6 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 155ea3f920617..84c4ee15ee8b0 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1017,15 +1017,10 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
// variable allocated in the inlined region)
llvm::Value *var = builder.CreateAlloca(
moduleTranslation.convertType(reductionDecls[i].getType()));
- // var->setName("private_redvar");
llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext());
llvm::Value *castVar =
builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
- // TODO: I (Sergio) just guessed casting phis[0] like it's done for var is
- // what's supposed to happen with this code coming from a merge from main,
- // but I don't actually know. Someone more familiar with it needs to check
- // this.
llvm::Value *castPhi =
builder.CreatePointerBitCastOrAddrSpaceCast(phis[0], ptrTy);
@@ -1039,7 +1034,6 @@ allocReductionVars(T loop, ArrayRef<BlockArgument> reductionArgs,
"allocaction is implicit for by-val reduction");
llvm::Value *var = builder.CreateAlloca(
moduleTranslation.convertType(reductionDecls[i].getType()));
- // var->setName("private_redvar");
llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext());
llvm::Value *castVar =