[llvm] 3c0f347 - [NFC][LV] Vectorized Loop Skeleton Refactoring

Tue Aug 4 11:51:34 PDT 2020

Author: Bardia Mahjour
Date: 2020-08-04T14:50:57-04:00
New Revision: 3c0f34700230fc4fd23ef408adb75387dcfeff41

URL: https://github.com/llvm/llvm-project/commit/3c0f34700230fc4fd23ef408adb75387dcfeff41
DIFF: https://github.com/llvm/llvm-project/commit/3c0f34700230fc4fd23ef408adb75387dcfeff41.diff

LOG: [NFC][LV] Vectorized Loop Skeleton Refactoring

This patch tries to improve readability and maintenance
of createVectorizedLoopSkeleton by reorganizing some lines,
updating some of the comments and breaking it up into
smaller logical units.

Reviewed By: pjeeva01

Differential Revision: https://reviews.llvm.org/D83824

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33bd31f6b983..7bf846d2a617 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -410,8 +410,11 @@ class InnerLoopVectorizer {
 
   virtual ~InnerLoopVectorizer() = default;
 
-  /// Create a new empty loop. Unlink the old loop and connect the new one.
-  /// Return the pre-header block of the new loop.
+  /// Create a new empty loop that will contain vectorized instructions later
+  /// on, while the old loop will be used as the scalar remainder. Control flow
+  /// is generated around the vectorized (and scalar epilogue) loops consisting
+  /// of various checks and bypasses. Return the pre-header block of the new
+  /// loop.
   BasicBlock *createVectorizedLoopSkeleton();
 
   /// Widen a single instruction within the innermost loop.
@@ -662,6 +665,22 @@ class InnerLoopVectorizer {
                               const DataLayout &DL,
                               const InductionDescriptor &ID) const;
 
+  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
+  /// vector loop preheader, middle block and scalar preheader. Also
+  /// allocate a loop object for the new vector loop and return it.
+  Loop *createVectorLoopSkeleton(StringRef Prefix);
+
+  /// Create new phi nodes for the induction variables to resume iteration count
+  /// in the scalar epilogue, from where the vectorized loop left off (given by
+  /// \p VectorTripCount).
+  void createInductionResumeValues(Loop *L, Value *VectorTripCount);
+
+  /// Complete the loop skeleton by adding debug MDs, creating appropriate
+  /// conditional branches in the middle block, preparing the builder and
+  /// running the verifier. Take in the vector loop \p L as argument, and return
+  /// the preheader of the completed vector loop.
+  BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
+
   /// Add additional metadata to \p To that was not present on \p Orig.
   ///
   /// Currently this is used to add the noalias annotations based on the
@@ -2957,56 +2976,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
   llvm_unreachable("invalid enum");
 }
 
-BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
-  /*
-   In this function we generate a new loop. The new loop will contain
-   the vectorized instructions while the old loop will continue to run the
-   scalar remainder.
-
-       [ ] <-- loop iteration number check.
-    /   |
-   /    v
-  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
-  |  /  |
-  | /   v
-  ||   [ ]     <-- vector pre header.
-  |/    |
-  |     v
-  |    [  ] \
-  |    [  ]_|   <-- vector loop.
-  |     |
-  |     v
-  |   -[ ]   <--- middle-block.
-  |  /  |
-  | /   v
-  -|- >[ ]     <--- new preheader.
-   |    |
-   |    v
-   |   [ ] \
-   |   [ ]_|   <-- old scalar loop to handle remainder.
-    \   |
-     \  v
-      >[ ]     <-- exit block.
-   ...
-   */
-
-  MDNode *OrigLoopID = OrigLoop->getLoopID();
-
-  // Some loops have a single integer induction variable, while other loops
-  // don't. One example is c++ iterators that often have multiple pointer
-  // induction variables. In the code below we also support a case where we
-  // don't have a single induction variable.
-  //
-  // We try to obtain an induction variable from the original loop as hard
-  // as possible. However if we don't find one that:
-  //   - is an integer
-  //   - counts from zero, stepping by one
-  //   - is the size of the widest induction variable type
-  // then we create a new one.
-  OldInduction = Legal->getPrimaryInduction();
-  Type *IdxTy = Legal->getWidestInductionType();
-
-  // Split the single block loop into the two loop structure described above.
+Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   LoopScalarBody = OrigLoop->getHeader();
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   LoopExitBlock = OrigLoop->getExitBlock();
@@ -3015,16 +2985,16 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
 
   LoopMiddleBlock =
       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
-                 LI, nullptr, "middle.block");
+                 LI, nullptr, Twine(Prefix) + "middle.block");
   LoopScalarPreHeader =
       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
-                 nullptr, "scalar.ph");
+                 nullptr, Twine(Prefix) + "scalar.ph");
   // We intentionally don't let SplitBlock to update LoopInfo since
   // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
   // LoopVectorBody is explicitly added to the correct place few lines later.
   LoopVectorBody =
       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
-                 nullptr, nullptr, "vector.body");
+                 nullptr, nullptr, Twine(Prefix) + "vector.body");
 
   // Update dominator for loop exit.
   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
@@ -3041,37 +3011,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     LI->addTopLevelLoop(Lp);
   }
   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
+  return Lp;
+}
 
-  // Find the loop boundaries.
-  Value *Count = getOrCreateTripCount(Lp);
-
-  Value *StartIdx = ConstantInt::get(IdxTy, 0);
-
-  // Now, compare the new count to zero. If it is zero skip the vector loop and
-  // jump to the scalar loop. This check also covers the case where the
-  // backedge-taken count is uint##_max: adding one to it will overflow leading
-  // to an incorrect trip count of zero. In this (rare) case we will also jump
-  // to the scalar loop.
-  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
-
-  // Generate the code to check any assumptions that we've made for SCEV
-  // expressions.
-  emitSCEVChecks(Lp, LoopScalarPreHeader);
-
-  // Generate the code that checks in runtime if arrays overlap. We put the
-  // checks into a separate block to make the more common case of few elements
-  // faster.
-  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
-
-  // Generate the induction variable.
-  // The loop step is equal to the vectorization factor (num of SIMD elements)
-  // times the unroll factor (num of SIMD instructions).
-  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
-  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
-  Induction =
-      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
-                              getDebugLocFromInstOrOperands(OldInduction));
-
+void InnerLoopVectorizer::createInductionResumeValues(Loop *L,
+                                                      Value *VectorTripCount) {
+  assert(VectorTripCount && L && "Expected valid arguments");
   // We are going to resume the execution of the scalar loop.
   // Go over all of the induction variables that we found and fix the
   // PHIs that are left in the scalar version of the loop.
@@ -3079,10 +3024,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // iteration in the vectorized loop.
   // If we come from a bypass edge then we need to start from the original
   // start value.
-
-  // This variable saves the new starting index for the scalar loop. It is used
-  // to test if there are any tail iterations left once the vector loop has
-  // completed.
   for (auto &InductionEntry : Legal->getInductionVars()) {
     PHINode *OrigPhi = InductionEntry.first;
     InductionDescriptor II = InductionEntry.second;
@@ -3096,13 +3037,13 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
     Value *&EndValue = IVEndValues[OrigPhi];
     if (OrigPhi == OldInduction) {
       // We know what the end value is.
-      EndValue = CountRoundDown;
+      EndValue = VectorTripCount;
     } else {
-      IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
+      IRBuilder<> B(L->getLoopPreheader()->getTerminator());
       Type *StepType = II.getStep()->getType();
       Instruction::CastOps CastOp =
-          CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
-      Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
+          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
+      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
       EndValue->setName("ind.end");
@@ -3119,6 +3060,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
       BCResumeVal->addIncoming(II.getStartValue(), BB);
     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
   }
+}
+
+BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
+                                                      MDNode *OrigLoopID) {
+  assert(L && "Expected valid loop.");
+
+  // The trip counts should be cached by now.
+  Value *Count = getOrCreateTripCount(L);
+  Value *VectorTripCount = getOrCreateVectorTripCount(L);
 
   // We need the OrigLoop (scalar loop part) latch terminator to help
   // produce correct debug info for the middle block BB instructions.
@@ -3136,7 +3086,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   Value *CmpN = Builder.getTrue();
   if (!Cost->foldTailByMasking()) {
     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                           CountRoundDown, "cmp.n",
+                           VectorTripCount, "cmp.n",
                            LoopMiddleBlock->getTerminator());
 
     // Here we use the same DebugLoc as the scalar loop latch branch instead
@@ -3152,7 +3102,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
 
   // Get ready to start creating new instructions into the vectorized body.
-  assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
+  assert(LoopVectorPreHeader == L->getLoopPreheader() &&
          "Inconsistent vector loop preheader");
   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
 
@@ -3160,7 +3110,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                       LLVMLoopVectorizeFollowupVectorized});
   if (VectorizedLoopID.hasValue()) {
-    Lp->setLoopID(VectorizedLoopID.getValue());
+    L->setLoopID(VectorizedLoopID.getValue());
 
     // Do not setAlreadyVectorized if loop attributes have been defined
     // explicitly.
@@ -3170,9 +3120,9 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   // Keep all loop hints from the original loop on the vector loop (we'll
   // replace the vectorizer-specific hints below).
   if (MDNode *LID = OrigLoop->getLoopID())
-    Lp->setLoopID(LID);
+    L->setLoopID(LID);
 
-  LoopVectorizeHints Hints(Lp, true, *ORE);
+  LoopVectorizeHints Hints(L, true, *ORE);
   Hints.setAlreadyVectorized();
 
 #ifdef EXPENSIVE_CHECKS
@@ -3183,6 +3133,90 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
   return LoopVectorPreHeader;
 }
 
+BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+  /*
+   In this function we generate a new loop. The new loop will contain
+   the vectorized instructions while the old loop will continue to run the
+   scalar remainder.
+
+       [ ] <-- loop iteration number check.
+    /   |
+   /    v
+  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
+  |  /  |
+  | /   v
+  ||   [ ]     <-- vector pre header.
+  |/    |
+  |     v
+  |    [  ] \
+  |    [  ]_|   <-- vector loop.
+  |     |
+  |     v
+  |   -[ ]   <--- middle-block.
+  |  /  |
+  | /   v
+  -|- >[ ]     <--- new preheader.
+   |    |
+   |    v
+   |   [ ] \
+   |   [ ]_|   <-- old scalar loop to handle remainder.
+    \   |
+     \  v
+      >[ ]     <-- exit block.
+   ...
+   */
+
+  // Get the metadata of the original loop before it gets modified.
+  MDNode *OrigLoopID = OrigLoop->getLoopID();
+
+  // Create an empty vector loop, and prepare basic blocks for the runtime
+  // checks.
+  Loop *Lp = createVectorLoopSkeleton("");
+
+  // Now, compare the new count to zero. If it is zero skip the vector loop and
+  // jump to the scalar loop. This check also covers the case where the
+  // backedge-taken count is uint##_max: adding one to it will overflow leading
+  // to an incorrect trip count of zero. In this (rare) case we will also jump
+  // to the scalar loop.
+  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
+
+  // Generate the code to check any assumptions that we've made for SCEV
+  // expressions.
+  emitSCEVChecks(Lp, LoopScalarPreHeader);
+
+  // Generate the code that checks in runtime if arrays overlap. We put the
+  // checks into a separate block to make the more common case of few elements
+  // faster.
+  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
+
+  // Some loops have a single integer induction variable, while other loops
+  // don't. One example is c++ iterators that often have multiple pointer
+  // induction variables. In the code below we also support a case where we
+  // don't have a single induction variable.
+  //
+  // We try to obtain an induction variable from the original loop as hard
+  // as possible. However if we don't find one that:
+  //   - is an integer
+  //   - counts from zero, stepping by one
+  //   - is the size of the widest induction variable type
+  // then we create a new one.
+  OldInduction = Legal->getPrimaryInduction();
+  Type *IdxTy = Legal->getWidestInductionType();
+  Value *StartIdx = ConstantInt::get(IdxTy, 0);
+  // The loop step is equal to the vectorization factor (num of SIMD elements)
+  // times the unroll factor (num of SIMD instructions).
+  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
+  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
+  Induction =
+      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
+                              getDebugLocFromInstOrOperands(OldInduction));
+
+  // Emit phis for the new starting index of the scalar loop.
+  createInductionResumeValues(Lp, CountRoundDown);
+
+  return completeLoopSkeleton(Lp, OrigLoopID);
+}
+
 // Fix up external users of the induction variable. At this point, we are
 // in LCSSA form, with all external PHIs that use the IV having one input value,
 // coming from the remainder loop. We need those PHIs to also have a correct