[llvm] 517c3ae - [OpenMP IRBuilder, MLIR] Add support for OpenMP do schedule dynamic

Kiran Chandramohan via llvm-commits llvm-commits at lists.llvm.org
Fri Apr 16 08:10:18 PDT 2021


Author: Mats Petersson
Date: 2021-04-16T16:09:49+01:00
New Revision: 517c3aee4de59369b109fa27b1e41df8679a6cd3

URL: https://github.com/llvm/llvm-project/commit/517c3aee4de59369b109fa27b1e41df8679a6cd3
DIFF: https://github.com/llvm/llvm-project/commit/517c3aee4de59369b109fa27b1e41df8679a6cd3.diff

LOG: [OpenMP IRBuilder, MLIR] Add support for OpenMP do schedule dynamic

The implementation supports static schedule for Fortran do loops. This
implements the dynamic variant of the same concept.

Reviewed By: Meinersbur

Differential Revision: https://reviews.llvm.org/D97393

Added: 
    

Modified: 
    llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
    llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
    llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
    llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
    mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index 4c67ea332aa92..0e0cb0189f5f0 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -107,6 +107,17 @@ inline std::string getAllAssumeClauseOptions() {
   return S + "'";
 }
 
+/// \note This needs to be kept in sync with kmp.h enum sched_type.
+/// Todo: Update kmp.h to include this file, and remove the enums in kmp.h
+///       To complete this, more enum values will need to be moved here.
+enum class OMPScheduleType {
+  Static = 34, /**< static unspecialized */
+  DynamicChunked = 35,
+  ModifierNonmonotonic =
+      (1 << 30), /**< Set if the nonmonotonic schedule modifier was present */
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierNonmonotonic)
+};
+
 } // end namespace omp
 
 } // end namespace llvm

diff  --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5a4b406649409..9c657cb6f1d3f 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -355,7 +355,7 @@ class OpenMPIRBuilder {
   /// \param CLI      A descriptor of the canonical loop to workshare.
   /// \param AllocaIP An insertion point for Alloca instructions usable in the
   ///                 preheader of the loop.
-  /// \param NeedsBarrier Indicates whether a barrier must be insterted after
+  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
   ///                     the loop.
   /// \param Chunk    The size of loop chunk considered as a unit when
   ///                 scheduling. If \p nullptr, defaults to 1.
@@ -367,6 +367,30 @@ class OpenMPIRBuilder {
                                                bool NeedsBarrier,
                                                Value *Chunk = nullptr);
 
+  /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
+  ///
+  /// This takes a \p LoopInfo representing a canonical loop, such as the one
+  /// created by \p createCanonicalLoop and emits additional instructions to
+  /// turn it into a workshare loop. In particular, it calls to an OpenMP
+  /// runtime function in the preheader to obtain, and then in each iteration
+  /// to update the loop counter.
+  /// \param Loc      The source location description, the insertion location
+  ///                 is not used.
+  /// \param CLI      A descriptor of the canonical loop to workshare.
+  /// \param AllocaIP An insertion point for Alloca instructions usable in the
+  ///                 preheader of the loop.
+  /// \param NeedsBarrier Indicates whether a barrier must be insterted after
+  ///                     the loop.
+  /// \param Chunk    The size of loop chunk considered as a unit when
+  ///                 scheduling. If \p nullptr, defaults to 1.
+  ///
+  /// \returns Point where to insert code after the loop.
+  InsertPointTy createDynamicWorkshareLoop(const LocationDescription &Loc,
+                                           CanonicalLoopInfo *CLI,
+                                           InsertPointTy AllocaIP,
+                                           bool NeedsBarrier,
+                                           Value *Chunk = nullptr);
+
   /// Modifies the canonical loop to be a workshare loop.
   ///
   /// This takes a \p LoopInfo representing a canonical loop, such as the one

diff  --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index ec9ecced08e74..de93f644818f8 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1168,10 +1168,8 @@ CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop(
 
   Value *ThreadNum = getOrCreateThreadID(SrcLoc);
 
-  // TODO: extract scheduling type and map it to OMP constant. This is curently
-  // happening in kmp.h and its ilk and needs to be moved to OpenMP.td first.
-  constexpr int StaticSchedType = 34;
-  Constant *SchedulingType = ConstantInt::get(I32Type, StaticSchedType);
+  Constant *SchedulingType =
+      ConstantInt::get(I32Type, static_cast<int>(OMPScheduleType::Static));
 
   // Call the "init" function and update the trip count of the loop with the
   // value it produced.
@@ -1220,6 +1218,148 @@ CanonicalLoopInfo *OpenMPIRBuilder::createWorkshareLoop(
   return createStaticWorkshareLoop(Loc, CLI, AllocaIP, NeedsBarrier);
 }
 
+/// Returns an LLVM function to call for initializing loop bounds using OpenMP
+/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
+/// the runtime. Always interpret integers as unsigned similarly to
+/// CanonicalLoopInfo.
+static FunctionCallee
+getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
+  unsigned Bitwidth = Ty->getIntegerBitWidth();
+  if (Bitwidth == 32)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
+  if (Bitwidth == 64)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
+  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+}
+
+/// Returns an LLVM function to call for updating the next loop using OpenMP
+/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
+/// the runtime. Always interpret integers as unsigned similarly to
+/// CanonicalLoopInfo.
+static FunctionCallee
+getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
+  unsigned Bitwidth = Ty->getIntegerBitWidth();
+  if (Bitwidth == 32)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
+  if (Bitwidth == 64)
+    return OMPBuilder.getOrCreateRuntimeFunction(
+        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
+  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createDynamicWorkshareLoop(
+    const LocationDescription &Loc, CanonicalLoopInfo *CLI,
+    InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) {
+  // Set up the source location value for OpenMP runtime.
+  Builder.SetCurrentDebugLocation(Loc.DL);
+
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+  Value *SrcLoc = getOrCreateIdent(SrcLocStr);
+
+  // Declare useful OpenMP runtime functions.
+  Value *IV = CLI->getIndVar();
+  Type *IVTy = IV->getType();
+  FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
+  FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
+
+  // Allocate space for computed loop bounds as expected by the "init" function.
+  Builder.restoreIP(AllocaIP);
+  Type *I32Type = Type::getInt32Ty(M.getContext());
+  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
+  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
+  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
+  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
+
+  // At the end of the preheader, prepare for calling the "init" function by
+  // storing the current loop bounds into the allocated space. A canonical loop
+  // always iterates from 0 to trip-count with step 1. Note that "init" expects
+  // and produces an inclusive upper bound.
+  BasicBlock *PreHeader = CLI->getPreheader();
+  Builder.SetInsertPoint(PreHeader->getTerminator());
+  Constant *One = ConstantInt::get(IVTy, 1);
+  Builder.CreateStore(One, PLowerBound);
+  Value *UpperBound = CLI->getTripCount();
+  Builder.CreateStore(UpperBound, PUpperBound);
+  Builder.CreateStore(One, PStride);
+
+  BasicBlock *Header = CLI->getHeader();
+  BasicBlock *Exit = CLI->getExit();
+  BasicBlock *Cond = CLI->getCond();
+  InsertPointTy AfterIP = CLI->getAfterIP();
+
+  // The CLI will be "broken" in the code below, as the loop is no longer
+  // a valid canonical loop.
+
+  if (!Chunk)
+    Chunk = One;
+
+  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
+
+  OMPScheduleType DynamicSchedType =
+      OMPScheduleType::DynamicChunked | OMPScheduleType::ModifierNonmonotonic;
+  Constant *SchedulingType =
+      ConstantInt::get(I32Type, static_cast<int>(DynamicSchedType));
+
+  // Call the "init" function.
+  Builder.CreateCall(DynamicInit,
+                     {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
+                      UpperBound, /* step */ One, Chunk});
+
+  // An outer loop around the existing one.
+  BasicBlock *OuterCond = BasicBlock::Create(
+      PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
+      PreHeader->getParent());
+  // This needs to be 32-bit always, so can't use the IVTy Zero above.
+  Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
+  Value *Res =
+      Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
+                                       PLowerBound, PUpperBound, PStride});
+  Constant *Zero32 = ConstantInt::get(I32Type, 0);
+  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
+  Value *LowerBound =
+      Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
+  Builder.CreateCondBr(MoreWork, Header, Exit);
+
+  // Change PHI-node in loop header to use outer cond rather than preheader,
+  // and set IV to the LowerBound.
+  Instruction *Phi = &Header->front();
+  auto *PI = cast<PHINode>(Phi);
+  PI->setIncomingBlock(0, OuterCond);
+  PI->setIncomingValue(0, LowerBound);
+
+  // Then set the pre-header to jump to the OuterCond
+  Instruction *Term = PreHeader->getTerminator();
+  auto *Br = cast<BranchInst>(Term);
+  Br->setSuccessor(0, OuterCond);
+
+  // Modify the inner condition:
+  // * Use the UpperBound returned from the DynamicNext call.
+  // * jump to the loop outer loop when done with one of the inner loops.
+  Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
+  UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
+  Instruction *Comp = &*Builder.GetInsertPoint();
+  auto *CI = cast<CmpInst>(Comp);
+  CI->setOperand(1, UpperBound);
+  // Redirect the inner exit to branch to outer condition.
+  Instruction *Branch = &Cond->back();
+  auto *BI = cast<BranchInst>(Branch);
+  assert(BI->getSuccessor(1) == Exit);
+  BI->setSuccessor(1, OuterCond);
+
+  // Add the barrier if requested.
+  if (NeedsBarrier) {
+    Builder.SetInsertPoint(&Exit->back());
+    createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
+                  /* CheckCancelFlag */ false);
+  }
+
+  return AfterIP;
+}
+
 /// Make \p Source branch to \p Target.
 ///
 /// Handles two situations:
@@ -1901,7 +2041,7 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
   llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
 
   Function *Fn =
-  		getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
+      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
 
   return Builder.CreateCall(Fn, Args);
 }

diff  --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index da81367180492..77913e6711303 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -1708,6 +1708,105 @@ TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) {
   EXPECT_EQ(NumCallsInExitBlock, 3u);
 }
 
+TEST_F(OpenMPIRBuilderTest, DynamicWorkShareLoop) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  IRBuilder<> Builder(BB);
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  Type *LCTy = Type::getInt32Ty(Ctx);
+  Value *StartVal = ConstantInt::get(LCTy, 10);
+  Value *StopVal = ConstantInt::get(LCTy, 52);
+  Value *StepVal = ConstantInt::get(LCTy, 2);
+  Value *ChunkVal = ConstantInt::get(LCTy, 7);
+  auto LoopBodyGen = [&](InsertPointTy, llvm::Value *) {};
+
+  CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
+      Loc, LoopBodyGen, StartVal, StopVal, StepVal,
+      /*IsSigned=*/false, /*InclusiveStop=*/false);
+
+  Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+  InsertPointTy AllocaIP = Builder.saveIP();
+
+  // Collect all the info from CLI, as it isn't usable after the call to
+  // createDynamicWorkshareLoop.
+  InsertPointTy AfterIP = CLI->getAfterIP();
+  BasicBlock *Preheader = CLI->getPreheader();
+  BasicBlock *ExitBlock = CLI->getExit();
+  Value *IV = CLI->getIndVar();
+
+  InsertPointTy EndIP =
+      OMPBuilder.createDynamicWorkshareLoop(Loc, CLI, AllocaIP,
+                                            /*NeedsBarrier=*/true, ChunkVal);
+  // The returned value should be the "after" point.
+  ASSERT_EQ(EndIP.getBlock(), AfterIP.getBlock());
+  ASSERT_EQ(EndIP.getPoint(), AfterIP.getPoint());
+
+  auto AllocaIter = BB->begin();
+  ASSERT_GE(std::distance(BB->begin(), BB->end()), 4);
+  AllocaInst *PLastIter = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PLowerBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PUpperBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  AllocaInst *PStride = dyn_cast<AllocaInst>(&*(AllocaIter++));
+  EXPECT_NE(PLastIter, nullptr);
+  EXPECT_NE(PLowerBound, nullptr);
+  EXPECT_NE(PUpperBound, nullptr);
+  EXPECT_NE(PStride, nullptr);
+
+  auto PreheaderIter = Preheader->begin();
+  ASSERT_GE(std::distance(Preheader->begin(), Preheader->end()), 6);
+  StoreInst *LowerBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  StoreInst *UpperBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  StoreInst *StrideStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
+  ASSERT_NE(LowerBoundStore, nullptr);
+  ASSERT_NE(UpperBoundStore, nullptr);
+  ASSERT_NE(StrideStore, nullptr);
+
+  CallInst *ThreadIdCall = dyn_cast<CallInst>(&*(PreheaderIter++));
+  ASSERT_NE(ThreadIdCall, nullptr);
+  EXPECT_EQ(ThreadIdCall->getCalledFunction()->getName(),
+            "__kmpc_global_thread_num");
+
+  CallInst *InitCall = dyn_cast<CallInst>(&*PreheaderIter);
+
+  ASSERT_NE(InitCall, nullptr);
+  EXPECT_EQ(InitCall->getCalledFunction()->getName(),
+            "__kmpc_dispatch_init_4u");
+  EXPECT_EQ(InitCall->getNumArgOperands(), 7U);
+  EXPECT_EQ(InitCall->getArgOperand(6),
+            ConstantInt::get(Type::getInt32Ty(Ctx), 7));
+
+  ConstantInt *OrigLowerBound =
+      dyn_cast<ConstantInt>(LowerBoundStore->getValueOperand());
+  ConstantInt *OrigUpperBound =
+      dyn_cast<ConstantInt>(UpperBoundStore->getValueOperand());
+  ConstantInt *OrigStride =
+      dyn_cast<ConstantInt>(StrideStore->getValueOperand());
+  ASSERT_NE(OrigLowerBound, nullptr);
+  ASSERT_NE(OrigUpperBound, nullptr);
+  ASSERT_NE(OrigStride, nullptr);
+  EXPECT_EQ(OrigLowerBound->getValue(), 1);
+  EXPECT_EQ(OrigUpperBound->getValue(), 21);
+  EXPECT_EQ(OrigStride->getValue(), 1);
+
+  // The original loop iterator should only be used in the condition, in the
+  // increment and in the statement that adds the lower bound to it.
+  EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3);
+
+  // The exit block should contain the barrier call, plus the call to obtain
+  // the thread ID.
+  size_t NumCallsInExitBlock =
+      count_if(*ExitBlock, [](Instruction &I) { return isa<CallInst>(I); });
+  EXPECT_EQ(NumCallsInExitBlock, 2u);
+
+  // Add a termination to our block and check that it is internally consistent.
+  Builder.restoreIP(EndIP);
+  Builder.CreateRetVoid();
+  OMPBuilder.finalize();
+  EXPECT_FALSE(verifyModule(*M, &errs()));
+}
+
 TEST_F(OpenMPIRBuilderTest, MasterDirective) {
   using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
   OpenMPIRBuilder OMPBuilder(*M);

diff  --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index e7e9cb9590521..3cd201ad08a5c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -179,11 +179,17 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   if (loop.getNumLoops() != 1)
     return opInst.emitOpError("collapsed loops not yet supported");
 
-  if (loop.schedule_val().hasValue() &&
-      omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue()) !=
-          omp::ClauseScheduleKind::Static)
-    return opInst.emitOpError(
-        "only static (default) loop schedule is currently supported");
+  bool isStatic = true;
+
+  if (loop.schedule_val().hasValue()) {
+    auto schedule =
+        omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue());
+    if (schedule != omp::ClauseScheduleKind::Static &&
+        schedule != omp::ClauseScheduleKind::Dynamic)
+      return opInst.emitOpError("only static (default) and dynamic loop "
+                                "schedule is currently supported");
+    isStatic = (schedule == omp::ClauseScheduleKind::Static);
+  }
 
   // Find the loop configuration.
   llvm::Value *lowerBound = moduleTranslation.lookupValue(loop.lowerBound()[0]);
@@ -241,11 +247,19 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   // Put them at the start of the current block for now.
   llvm::OpenMPIRBuilder::InsertPointTy allocaIP(
       insertBlock, insertBlock->getFirstInsertionPt());
-  loopInfo = moduleTranslation.getOpenMPBuilder()->createStaticWorkshareLoop(
-      ompLoc, loopInfo, allocaIP, !loop.nowait(), chunk);
+  llvm::OpenMPIRBuilder::InsertPointTy afterIP;
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  if (isStatic) {
+    loopInfo = ompBuilder->createStaticWorkshareLoop(ompLoc, loopInfo, allocaIP,
+                                                     !loop.nowait(), chunk);
+    afterIP = loopInfo->getAfterIP();
+  } else {
+    afterIP = ompBuilder->createDynamicWorkshareLoop(ompLoc, loopInfo, allocaIP,
+                                                     !loop.nowait(), chunk);
+  }
 
   // Continue building IR after the loop.
-  builder.restoreIP(loopInfo->getAfterIP());
+  builder.restoreIP(afterIP);
   return success();
 }
 


        


More information about the llvm-commits mailing list