[llvm] 517c3ae - [OpenMP IRBuilder, MLIR] Add support for OpenMP do schedule dynamic
Kiran Chandramohan via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 16 08:10:18 PDT 2021
Author: Mats Petersson
Date: 2021-04-16T16:09:49+01:00
New Revision: 517c3aee4de59369b109fa27b1e41df8679a6cd3
URL: https://github.com/llvm/llvm-project/commit/517c3aee4de59369b109fa27b1e41df8679a6cd3
DIFF: https://github.com/llvm/llvm-project/commit/517c3aee4de59369b109fa27b1e41df8679a6cd3.diff
LOG: [OpenMP IRBuilder, MLIR] Add support for OpenMP do schedule dynamic
The implementation supports static schedule for Fortran do loops. This
implements the dynamic variant of the same concept.
Reviewed By: Meinersbur
Differential Revision: https://reviews.llvm.org/D97393
Added:
Modified:
llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index 4c67ea332aa92..0e0cb0189f5f0 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -107,6 +107,17 @@ inline std::string getAllAssumeClauseOptions() {
return S + "'";
}
+/// \note This needs to be kept in sync with kmp.h enum sched_type.
+/// Todo: Update kmp.h to include this file, and remove the enums in kmp.h
+/// To complete this, more enum values will need to be moved here.
+enum class OMPScheduleType {
+ Static = 34, /**< static unspecialized */
+ DynamicChunked = 35,
+ ModifierNonmonotonic =
+ (1 << 30), /**< Set if the nonmonotonic schedule modifier was present */
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierNonmonotonic)
+};
+
} // end namespace omp
} // end namespace llvm
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5a4b406649409..9c657cb6f1d3f 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -355,7 +355,7 @@ class OpenMPIRBuilder {
/// \param CLI A descriptor of the canonical loop to workshare.
/// \param AllocaIP An insertion point for Alloca instructions usable in the
/// preheader of the loop.
- /// \param NeedsBarrier Indicates whether a barrier must be insterted after
+ /// \param NeedsBarrier Indicates whether a barrier must be inserted after
/// the loop.
/// \param Chunk The size of loop chunk considered as a unit when
/// scheduling. If \p nullptr, defaults to 1.
@@ -367,6 +367,30 @@ class OpenMPIRBuilder {
bool NeedsBarrier,
Value *Chunk = nullptr);
+ /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
+ ///
+ /// This takes a \p LoopInfo representing a canonical loop, such as the one
+ /// created by \p createCanonicalLoop and emits additional instructions to
+ /// turn it into a workshare loop. In particular, it calls an OpenMP runtime
+ /// "init" function in the preheader, and then calls a "next" function in an
+ /// outer loop wrapped around the original one to fetch each chunk's bounds.
+ /// \param Loc The source location description, the insertion location
+ /// is not used.
+ /// \param CLI A descriptor of the canonical loop to workshare.
+ /// \param AllocaIP An insertion point for Alloca instructions usable in the
+ /// preheader of the loop.
+ /// \param NeedsBarrier Indicates whether a barrier must be inserted after
+ /// the loop.
+ /// \param Chunk The size of loop chunk considered as a unit when
+ /// scheduling. If \p nullptr, defaults to 1.
+ ///
+ /// \returns Point where to insert code after the loop.
+ InsertPointTy createDynamicWorkshareLoop(const LocationDescription &Loc,
+ CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP,
+ bool NeedsBarrier,
+ Value *Chunk = nullptr);
+
/// Modifies the canonical loop to be a workshare loop.
///
/// This takes a \p LoopInfo representing a canonical loop, such as the one
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index ec9ecced08e74..de93f644818f8 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1168,10 +1168,8 @@ CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop(
Value *ThreadNum = getOrCreateThreadID(SrcLoc);
- // TODO: extract scheduling type and map it to OMP constant. This is curently
- // happening in kmp.h and its ilk and needs to be moved to OpenMP.td first.
- constexpr int StaticSchedType = 34;
- Constant *SchedulingType = ConstantInt::get(I32Type, StaticSchedType);
+ Constant *SchedulingType =
+ ConstantInt::get(I32Type, static_cast<int>(OMPScheduleType::Static));
// Call the "init" function and update the trip count of the loop with the
// value it produced.
@@ -1220,6 +1218,148 @@ CanonicalLoopInfo *OpenMPIRBuilder::createWorkshareLoop(
return createStaticWorkshareLoop(Loc, CLI, AllocaIP, NeedsBarrier);
}
+/// Returns an LLVM function to call for initializing loop bounds using OpenMP
+/// dynamic scheduling depending on `Ty`. Only i32 and i64 are supported by
+/// the runtime. Always interpret integers as unsigned similarly to
+/// CanonicalLoopInfo.
+static FunctionCallee
+getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
+ unsigned Bitwidth = Ty->getIntegerBitWidth();
+ if (Bitwidth == 32)
+ return OMPBuilder.getOrCreateRuntimeFunction(
+ M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
+ if (Bitwidth == 64)
+ return OMPBuilder.getOrCreateRuntimeFunction(
+ M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
+ llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+}
+
+/// Returns an LLVM function to call for updating the next loop using OpenMP
+/// dynamic scheduling depending on `Ty`. Only i32 and i64 are supported by
+/// the runtime. Always interpret integers as unsigned similarly to
+/// CanonicalLoopInfo.
+static FunctionCallee
+getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
+ unsigned Bitwidth = Ty->getIntegerBitWidth();
+ if (Bitwidth == 32)
+ return OMPBuilder.getOrCreateRuntimeFunction(
+ M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
+ if (Bitwidth == 64)
+ return OMPBuilder.getOrCreateRuntimeFunction(
+ M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
+ llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createDynamicWorkshareLoop(
+ const LocationDescription &Loc, CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) {
+ // Set up the source location value for OpenMP runtime.
+ Builder.SetCurrentDebugLocation(Loc.DL);
+
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Value *SrcLoc = getOrCreateIdent(SrcLocStr);
+
+ // Declare useful OpenMP runtime functions.
+ Value *IV = CLI->getIndVar();
+ Type *IVTy = IV->getType();
+ FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
+ FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
+
+ // Allocate space for computed loop bounds as expected by the "init" function.
+ Builder.restoreIP(AllocaIP);
+ Type *I32Type = Type::getInt32Ty(M.getContext());
+ Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
+ Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
+ Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
+ Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
+
+ // At the end of the preheader, prepare for calling the "init" function by
+ // storing the current loop bounds into the allocated space. A canonical loop
+ // always iterates from 0 to trip-count with step 1. Note that "init" expects
+ // and produces an inclusive upper bound.
+ BasicBlock *PreHeader = CLI->getPreheader();
+ Builder.SetInsertPoint(PreHeader->getTerminator());
+ Constant *One = ConstantInt::get(IVTy, 1);
+ Builder.CreateStore(One, PLowerBound);
+ Value *UpperBound = CLI->getTripCount();
+ Builder.CreateStore(UpperBound, PUpperBound);
+ Builder.CreateStore(One, PStride);
+
+ BasicBlock *Header = CLI->getHeader();
+ BasicBlock *Exit = CLI->getExit();
+ BasicBlock *Cond = CLI->getCond();
+ InsertPointTy AfterIP = CLI->getAfterIP();
+
+ // The CLI will be "broken" in the code below, as the loop is no longer
+ // a valid canonical loop.
+
+ if (!Chunk)
+ Chunk = One;
+
+ Value *ThreadNum = getOrCreateThreadID(SrcLoc);
+
+ OMPScheduleType DynamicSchedType =
+ OMPScheduleType::DynamicChunked | OMPScheduleType::ModifierNonmonotonic;
+ Constant *SchedulingType =
+ ConstantInt::get(I32Type, static_cast<int>(DynamicSchedType));
+
+ // Call the "init" function.
+ Builder.CreateCall(DynamicInit,
+ {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
+ UpperBound, /* step */ One, Chunk});
+
+ // An outer loop around the existing one.
+ BasicBlock *OuterCond = BasicBlock::Create(
+ PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
+ PreHeader->getParent());
+ // The zero compared against below needs to be 32-bit always, not IVTy.
+ Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
+ Value *Res =
+ Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
+ PLowerBound, PUpperBound, PStride});
+ Constant *Zero32 = ConstantInt::get(I32Type, 0);
+ Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
+ Value *LowerBound =
+ Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
+ Builder.CreateCondBr(MoreWork, Header, Exit);
+
+ // Change PHI-node in loop header to use outer cond rather than preheader,
+ // and set IV to the LowerBound.
+ Instruction *Phi = &Header->front();
+ auto *PI = cast<PHINode>(Phi);
+ PI->setIncomingBlock(0, OuterCond);
+ PI->setIncomingValue(0, LowerBound);
+
+ // Then set the pre-header to jump to the OuterCond
+ Instruction *Term = PreHeader->getTerminator();
+ auto *Br = cast<BranchInst>(Term);
+ Br->setSuccessor(0, OuterCond);
+
+ // Modify the inner condition:
+ // * Use the UpperBound returned from the DynamicNext call.
+ // * jump to the outer loop when done with one of the inner loops.
+ Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
+ UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
+ Instruction *Comp = &*Builder.GetInsertPoint();
+ auto *CI = cast<CmpInst>(Comp);
+ CI->setOperand(1, UpperBound);
+ // Redirect the inner exit to branch to outer condition.
+ Instruction *Branch = &Cond->back();
+ auto *BI = cast<BranchInst>(Branch);
+ assert(BI->getSuccessor(1) == Exit);
+ BI->setSuccessor(1, OuterCond);
+
+ // Add the barrier if requested.
+ if (NeedsBarrier) {
+ Builder.SetInsertPoint(&Exit->back());
+ createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+ omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
+ /* CheckCancelFlag */ false);
+ }
+
+ return AfterIP;
+}
+
/// Make \p Source branch to \p Target.
///
/// Handles two situations:
@@ -1901,7 +2041,7 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
Function *Fn =
- getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
return Builder.CreateCall(Fn, Args);
}
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index da81367180492..77913e6711303 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -1708,6 +1708,105 @@ TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) {
EXPECT_EQ(NumCallsInExitBlock, 3u);
}
+TEST_F(OpenMPIRBuilderTest, DynamicWorkShareLoop) {
+ using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+ OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.initialize();
+ IRBuilder<> Builder(BB);
+ OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+ Type *LCTy = Type::getInt32Ty(Ctx);
+ Value *StartVal = ConstantInt::get(LCTy, 10);
+ Value *StopVal = ConstantInt::get(LCTy, 52);
+ Value *StepVal = ConstantInt::get(LCTy, 2);
+ Value *ChunkVal = ConstantInt::get(LCTy, 7);
+ auto LoopBodyGen = [&](InsertPointTy, llvm::Value *) {};
+
+ CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
+ Loc, LoopBodyGen, StartVal, StopVal, StepVal,
+ /*IsSigned=*/false, /*InclusiveStop=*/false);
+
+ Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+ InsertPointTy AllocaIP = Builder.saveIP();
+
+ // Collect all the info from CLI, as it isn't usable after the call to
+ // createDynamicWorkshareLoop.
+ InsertPointTy AfterIP = CLI->getAfterIP();
+ BasicBlock *Preheader = CLI->getPreheader();
+ BasicBlock *ExitBlock = CLI->getExit();
+ Value *IV = CLI->getIndVar();
+
+ InsertPointTy EndIP =
+ OMPBuilder.createDynamicWorkshareLoop(Loc, CLI, AllocaIP,
+ /*NeedsBarrier=*/true, ChunkVal);
+ // The returned value should be the "after" point.
+ ASSERT_EQ(EndIP.getBlock(), AfterIP.getBlock());
+ ASSERT_EQ(EndIP.getPoint(), AfterIP.getPoint());
+
+ auto AllocaIter = BB->begin();
+ ASSERT_GE(std::distance(BB->begin(), BB->end()), 4);
+ AllocaInst *PLastIter = dyn_cast<AllocaInst>(&*(AllocaIter++));
+ AllocaInst *PLowerBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+ AllocaInst *PUpperBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+ AllocaInst *PStride = dyn_cast<AllocaInst>(&*(AllocaIter++));
+ EXPECT_NE(PLastIter, nullptr);
+ EXPECT_NE(PLowerBound, nullptr);
+ EXPECT_NE(PUpperBound, nullptr);
+ EXPECT_NE(PStride, nullptr);
+
+ auto PreheaderIter = Preheader->begin();
+ ASSERT_GE(std::distance(Preheader->begin(), Preheader->end()), 6);
+ StoreInst *LowerBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+ StoreInst *UpperBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+ StoreInst *StrideStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+ ASSERT_NE(LowerBoundStore, nullptr);
+ ASSERT_NE(UpperBoundStore, nullptr);
+ ASSERT_NE(StrideStore, nullptr);
+
+ CallInst *ThreadIdCall = dyn_cast<CallInst>(&*(PreheaderIter++));
+ ASSERT_NE(ThreadIdCall, nullptr);
+ EXPECT_EQ(ThreadIdCall->getCalledFunction()->getName(),
+ "__kmpc_global_thread_num");
+
+ CallInst *InitCall = dyn_cast<CallInst>(&*PreheaderIter);
+
+ ASSERT_NE(InitCall, nullptr);
+ EXPECT_EQ(InitCall->getCalledFunction()->getName(),
+ "__kmpc_dispatch_init_4u");
+ EXPECT_EQ(InitCall->getNumArgOperands(), 7U);
+ EXPECT_EQ(InitCall->getArgOperand(6),
+ ConstantInt::get(Type::getInt32Ty(Ctx), 7));
+
+ ConstantInt *OrigLowerBound =
+ dyn_cast<ConstantInt>(LowerBoundStore->getValueOperand());
+ ConstantInt *OrigUpperBound =
+ dyn_cast<ConstantInt>(UpperBoundStore->getValueOperand());
+ ConstantInt *OrigStride =
+ dyn_cast<ConstantInt>(StrideStore->getValueOperand());
+ ASSERT_NE(OrigLowerBound, nullptr);
+ ASSERT_NE(OrigUpperBound, nullptr);
+ ASSERT_NE(OrigStride, nullptr);
+ EXPECT_EQ(OrigLowerBound->getValue(), 1);
+ EXPECT_EQ(OrigUpperBound->getValue(), 21);
+ EXPECT_EQ(OrigStride->getValue(), 1);
+
+ // The original loop iterator should only be used in the condition, in the
+ // increment and in the statement that adds the lower bound to it.
+ EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3);
+
+ // The exit block should contain the barrier call, plus the call to obtain
+ // the thread ID.
+ size_t NumCallsInExitBlock =
+ count_if(*ExitBlock, [](Instruction &I) { return isa<CallInst>(I); });
+ EXPECT_EQ(NumCallsInExitBlock, 2u);
+
+ // Add a termination to our block and check that it is internally consistent.
+ Builder.restoreIP(EndIP);
+ Builder.CreateRetVoid();
+ OMPBuilder.finalize();
+ EXPECT_FALSE(verifyModule(*M, &errs()));
+}
+
TEST_F(OpenMPIRBuilderTest, MasterDirective) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index e7e9cb9590521..3cd201ad08a5c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -179,11 +179,17 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
if (loop.getNumLoops() != 1)
return opInst.emitOpError("collapsed loops not yet supported");
- if (loop.schedule_val().hasValue() &&
- omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue()) !=
- omp::ClauseScheduleKind::Static)
- return opInst.emitOpError(
- "only static (default) loop schedule is currently supported");
+ bool isStatic = true;
+
+ if (loop.schedule_val().hasValue()) {
+ auto schedule =
+ omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue());
+ if (schedule != omp::ClauseScheduleKind::Static &&
+ schedule != omp::ClauseScheduleKind::Dynamic)
+ return opInst.emitOpError("only static (default) and dynamic loop "
+ "schedule is currently supported");
+ isStatic = (schedule == omp::ClauseScheduleKind::Static);
+ }
// Find the loop configuration.
llvm::Value *lowerBound = moduleTranslation.lookupValue(loop.lowerBound()[0]);
@@ -241,11 +247,19 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
// Put them at the start of the current block for now.
llvm::OpenMPIRBuilder::InsertPointTy allocaIP(
insertBlock, insertBlock->getFirstInsertionPt());
- loopInfo = moduleTranslation.getOpenMPBuilder()->createStaticWorkshareLoop(
- ompLoc, loopInfo, allocaIP, !loop.nowait(), chunk);
+ llvm::OpenMPIRBuilder::InsertPointTy afterIP;
+ llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+ if (isStatic) {
+ loopInfo = ompBuilder->createStaticWorkshareLoop(ompLoc, loopInfo, allocaIP,
+ !loop.nowait(), chunk);
+ afterIP = loopInfo->getAfterIP();
+ } else {
+ afterIP = ompBuilder->createDynamicWorkshareLoop(ompLoc, loopInfo, allocaIP,
+ !loop.nowait(), chunk);
+ }
// Continue building IR after the loop.
- builder.restoreIP(loopInfo->getAfterIP());
+ builder.restoreIP(afterIP);
return success();
}
More information about the llvm-commits
mailing list