[llvm] af7e158 - [LV] Vectorizer should adjust trip count in profile information

Evgeniy Brevnov via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 20 03:36:41 PST 2020


Author: Evgeniy Brevnov
Date: 2020-01-20T18:36:28+07:00
New Revision: af7e1588727c691ae07e286c94dbcbf31060e876

URL: https://github.com/llvm/llvm-project/commit/af7e1588727c691ae07e286c94dbcbf31060e876
DIFF: https://github.com/llvm/llvm-project/commit/af7e1588727c691ae07e286c94dbcbf31060e876.diff

LOG: [LV] Vectorizer should adjust trip count in profile information

Summary: A vectorized loop processes VFxUF elements per iteration, so the total number of iterations decreases proportionally. In addition, the epilog loop cannot have more than VFxUF - 1 iterations. This patch updates the profile information accordingly.
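
For illustration, here is the effect on the profile used in the added
check-prof-info.ll test (VF=4, UF=1); this is a worked sketch only, the actual
update is done by setLoopEstimatedTripCount/setProfileInfoAfterUnrolling in the
diff below:

  scalar loop:        !prof !{"branch_weights", i32 1, i32 1023}
                      => estimated trip count TC = 1023/1 + 1 = 1024
  vector loop:        TC / (VF*UF) = 1024 / 4 = 256
                      => !prof !{"branch_weights", i32 1, i32 255}
  remainder (epilog): TC % (VF*UF) = 1024 % 4 = 0
                      => !prof !{"branch_weights", i32 0, i32 0}

For the 1027-iteration loop in the same test the epilog gets 1027 % 4 = 3
iterations, i.e. !prof !{"branch_weights", i32 1, i32 2}; the epilog never
exceeds VF*UF - 1 iterations.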

Reviewers: hsaito, Ayal, fhahn, reames, silvas, dcaballe, SjoerdMeijer, mkuper, DaniilSuchkov

Reviewed By: Ayal, DaniilSuchkov

Subscribers: fedor.sergeev, hiraditya, rkruppe, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D67905

Added: 
    llvm/test/Transforms/LoopVectorize/check-prof-info.ll

Modified: 
    llvm/include/llvm/Transforms/Utils/LoopUtils.h
    llvm/lib/Transforms/Utils/LoopUtils.cpp
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/tripcount.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 576a7e8d43e8..3b15f6379211 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -262,10 +262,22 @@ TransformationMode hasLICMVersioningTransformation(Loop *L);
 void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
                              unsigned V = 0);
 
-/// Get a loop's estimated trip count based on branch weight metadata.
+/// Returns a loop's estimated trip count based on branch weight metadata.
+/// In addition, if \p EstimatedLoopInvocationWeight is not null, it is
+/// initialized with the weight of the latch edge leading to the exit.
 /// Returns 0 when the count is estimated to be 0, or None when a meaningful
 /// estimate can not be made.
-Optional<unsigned> getLoopEstimatedTripCount(Loop *L);
+Optional<unsigned>
+getLoopEstimatedTripCount(Loop *L,
+                          unsigned *EstimatedLoopInvocationWeight = nullptr);
+
+/// Set a loop's branch weight metadata to reflect that the loop has \p
+/// EstimatedTripCount iterations and \p EstimatedLoopInvocationWeight exits
+/// through the latch. Returns true if the metadata was successfully updated,
+/// false otherwise. Note that the loop must have a latch block which controls
+/// the loop exit in order for this to succeed.
+bool setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
+                               unsigned EstimatedLoopInvocationWeight);
 
 /// Check inner loop (L) backedge count is known to be invariant on all
 /// iterations of its outer loop. If the loop has no parent, this is trivially
@@ -370,6 +382,23 @@ int rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
                           DominatorTree *DT, ReplaceExitVal ReplaceExitValue,
                           SmallVector<WeakTrackingVH, 16> &DeadInsts);
 
+/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
+/// \p OrigLoop and the following distribution of \p OrigLoop iterations among \p
+/// UnrolledLoop and \p RemainderLoop. \p UnrolledLoop receives weights that
+/// reflect TC/UF iterations, and \p RemainderLoop receives weights that reflect
+/// the remaining TC%UF iterations.
+///
+/// Note that \p OrigLoop may be equal to either \p UnrolledLoop or \p
+/// RemainderLoop, in which case the weights for \p OrigLoop are updated
+/// accordingly. Note also that the behavior is undefined if \p UnrolledLoop
+/// and \p RemainderLoop are equal. \p UF must be greater than zero.
+/// If \p OrigLoop has no associated profile info, nothing happens.
+///
+/// This utility may be useful for optimizations such as the unroller and the
+/// vectorizer, for which this is a typical transformation.
+void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
+                                  Loop *RemainderLoop, uint64_t UF);
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H

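For orientation, a minimal caller-side sketch of the extended interface
declared above (the helper and its use are hypothetical, not part of this
patch):

  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Transforms/Utils/LoopUtils.h"
  using namespace llvm;

  // Halve a loop's estimated trip count while preserving the recorded
  // invocation (latch exit) weight returned via the new out-parameter.
  static void halveEstimatedTripCount(Loop *L) {
    unsigned InvocationWeight = 0;
    if (Optional<unsigned> TC = getLoopEstimatedTripCount(L, &InvocationWeight))
      setLoopEstimatedTripCount(L, *TC / 2, InvocationWeight);
  }
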
diff  --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index c9de4340cb29..88b0f8eff27b 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -32,6 +32,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
@@ -690,17 +691,17 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
   }
 }
 
-Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
-  // Support loops with an exiting latch and other existing exists only
-  // deoptimize.
-
-  // Get the branch weights for the loop's backedge.
+/// Checks if \p L exits through its latch and all other exits (if any) are
+/// "deoptimizing" exits. Returns the branch instruction terminating the loop
+/// latch if the check succeeds, nullptr otherwise.
+static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
   BasicBlock *Latch = L->getLoopLatch();
   if (!Latch)
-    return None;
+    return nullptr;
+
   BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
   if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
-    return None;
+    return nullptr;
 
   assert((LatchBR->getSuccessor(0) == L->getHeader() ||
           LatchBR->getSuccessor(1) == L->getHeader()) &&
@@ -711,21 +712,36 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
   if (any_of(ExitBlocks, [](const BasicBlock *EB) {
         return !EB->getTerminatingDeoptimizeCall();
       }))
+    return nullptr;
+
+  return LatchBR;
+}
+
+Optional<unsigned>
+llvm::getLoopEstimatedTripCount(Loop *L,
+                                unsigned *EstimatedLoopInvocationWeight) {
+  // Support loops with an exiting latch where all other exits only
+  // deoptimize.
+  BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+  if (!LatchBranch)
     return None;
 
   // To estimate the number of times the loop body was executed, we want to
   // know the number of times the backedge was taken, vs. the number of times
   // we exited the loop.
   uint64_t BackedgeTakenWeight, LatchExitWeight;
-  if (!LatchBR->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
+  if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
     return None;
 
-  if (LatchBR->getSuccessor(0) != L->getHeader())
+  if (LatchBranch->getSuccessor(0) != L->getHeader())
     std::swap(BackedgeTakenWeight, LatchExitWeight);
 
   if (!LatchExitWeight)
     return None;
 
+  if (EstimatedLoopInvocationWeight)
+    *EstimatedLoopInvocationWeight = LatchExitWeight;
+
   // Estimated backedge taken count is a ratio of the backedge taken weight by
   // the weight of the edge exiting the loop, rounded to nearest.
   uint64_t BackedgeTakenCount =
@@ -734,6 +750,37 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
   return BackedgeTakenCount + 1;
 }
 
+bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
+                                     unsigned EstimatedLoopInvocationWeight) {
+  // Support loops with an exiting latch where all other exits only
+  // deoptimize.
+  BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+  if (!LatchBranch)
+    return false;
+
+  // Calculate taken and exit weights.
+  unsigned LatchExitWeight = 0;
+  unsigned BackedgeTakenWeight = 0;
+
+  if (EstimatedTripCount > 0) {
+    LatchExitWeight = EstimatedLoopInvocationWeight;
+    BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight;
+  }
+
+  // Swap if the backedge is taken when the condition is "false".
+  if (LatchBranch->getSuccessor(0) != L->getHeader())
+    std::swap(BackedgeTakenWeight, LatchExitWeight);
+
+  MDBuilder MDB(LatchBranch->getContext());
+
+  // Set/Update profile metadata.
+  LatchBranch->setMetadata(
+      LLVMContext::MD_prof,
+      MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
+
+  return true;
+}
+
 bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
                                               ScalarEvolution &SE) {
   Loop *OuterL = InnerLoop->getParentLoop();
@@ -1351,3 +1398,29 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI,
   Rewriter.clearInsertPoint();
   return NumReplaced;
 }
+
+/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
+/// \p OrigLoop.
+void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
+                                        Loop *RemainderLoop, uint64_t UF) {
+  assert(UF > 0 && "Zero unroll factor is not supported");
+  assert(UnrolledLoop != RemainderLoop &&
+         "Unrolled and Remainder loops are expected to be distinct");
+
+  // Get number of iterations in the original scalar loop.
+  unsigned OrigLoopInvocationWeight = 0;
+  Optional<unsigned> OrigAverageTripCount =
+      getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
+  if (!OrigAverageTripCount)
+    return;
+
+  // Calculate number of iterations in unrolled loop.
+  unsigned UnrolledAverageTripCount = *OrigAverageTripCount / UF;
+  // Calculate number of iterations for remainder loop.
+  unsigned RemainderAverageTripCount = *OrigAverageTripCount % UF;
+
+  setLoopEstimatedTripCount(UnrolledLoop, UnrolledAverageTripCount,
+                            OrigLoopInvocationWeight);
+  setLoopEstimatedTripCount(RemainderLoop, RemainderAverageTripCount,
+                            OrigLoopInvocationWeight);
+}

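For concreteness, the round trip that setProfileInfoAfterUnrolling performs,
shown on the profile from the updated tripcount.ll test below (VF*UF = 4); the
variables OrigLoop and VectorLoop are hypothetical stand-ins:

  // Original latch !prof !{10, 10000}: decoding gives
  //   InvocationWeight = 10 and TC = 10000/10 + 1 = 1001.
  unsigned InvocationWeight = 0;
  Optional<unsigned> TC = getLoopEstimatedTripCount(OrigLoop, &InvocationWeight);
  // Re-encode both loops, preserving the invocation weight. The original loop
  // is reused as the remainder, as the vectorizer does in the next hunk.
  setLoopEstimatedTripCount(VectorLoop, *TC / 4, InvocationWeight); // 250 -> !{10, 2490}
  setLoopEstimatedTripCount(OrigLoop, *TC % 4, InvocationWeight);   // 1   -> !{10, 0}
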
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index af42e00f0b74..b1650713d546 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3483,6 +3483,19 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
 
   // Remove redundant induction instructions.
   cse(LoopVectorBody);
+
+  // Set/update profile weights for the vector and remainder loops as the
+  // original loop iterations are now distributed among them. Note that the
+  // original loop, represented by LoopScalarBody, becomes the remainder loop
+  // after vectorization.
+  //
+  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
+  // end up with a slightly less accurate result, but that should be OK since
+  // the profile is not inherently precise anyway. Note also that a possible
+  // bypass of the vector code caused by legality checks is ignored,
+  // optimistically assigning all the weight to the vector loop.
+  setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
+                               LI->getLoopFor(LoopVectorBody),
+                               LI->getLoopFor(LoopScalarBody), VF * UF);
 }
 
 void InnerLoopVectorizer::fixCrossIterationPHIs() {

diff  --git a/llvm/test/Transforms/LoopVectorize/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
new file mode 100644
index 000000000000..50b64d86c230
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/check-prof-info.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt  -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=1 -S < %s |  FileCheck %s
+; RUN: opt  -passes="print<block-freq>,loop-vectorize" -force-vector-width=4 -force-vector-interleave=4 -S < %s |  FileCheck %s -check-prefix=CHECK-MASKED
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+ at a = dso_local global [1024 x i32] zeroinitializer, align 16
+ at b = dso_local global [1024 x i32] zeroinitializer, align 16
+
+; Check correctness of profile info for vectorization without epilog.
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @_Z3foov() local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3foov(
+; CHECK:  [[VECTOR_BODY:vector\.body]]:
+; CHECK:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
+; CHECK:  [[FOR_BODY:for\.body]]:
+; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
+; CHECK-MASKED:  [[VECTOR_BODY:vector\.body]]:
+; CHECK-MASKED:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
+; CHECK-MASKED:  [[FOR_BODY:for\.body]]:
+; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %1 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
+  %add = add nsw i32 %2, %mul
+  store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !6
+}
+
+; Check correctness of profile info for vectorization with epilog.
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @_Z3foo2v() local_unnamed_addr #0 {
+; CHECK-LABEL: @_Z3foo2v(
+; CHECK:  [[VECTOR_BODY:vector\.body]]:
+; CHECK:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
+; CHECK:  [[FOR_BODY:for\.body]]:
+; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
+; CHECK-MASKED:  [[VECTOR_BODY:vector\.body]]:
+; CHECK-MASKED:    br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
+; CHECK-MASKED:  [[FOR_BODY:for\.body]]:
+; CHECK-MASKED:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %1 = trunc i64 %indvars.iv to i32
+  %mul = mul nsw i32 %0, %1
+  %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx2, align 4, !tbaa !2
+  %add = add nsw i32 %2, %mul
+  store i32 %add, i32* %arrayidx2, align 4, !tbaa !2
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1027
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !7
+}
+
+attributes #0 = { "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+; CHECK: [[LP1_255]] = !{!"branch_weights", i32 1, i32 255}
+; CHECK: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-MASKED: [[LP1_63]] = !{!"branch_weights", i32 1, i32 63}
+; CHECK-MASKED: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK: [[LP1_2]] = !{!"branch_weights", i32 1, i32 2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C++ TBAA"}
+!6 = !{!"branch_weights", i32 1, i32 1023}
+!7 = !{!"branch_weights", i32 1, i32 1026}

diff  --git a/llvm/test/Transforms/LoopVectorize/tripcount.ll b/llvm/test/Transforms/LoopVectorize/tripcount.ll
index 56f8b3e83c7d..e19f4aa85c02 100644
--- a/llvm/test/Transforms/LoopVectorize/tripcount.ll
+++ b/llvm/test/Transforms/LoopVectorize/tripcount.ll
@@ -61,8 +61,10 @@ define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
 ; but has a high trip count per invocation. Vectorize it.
 
 ; CHECK-LABEL: @foo_low_trip_count3(
-; CHECK: vector.body:
-
+; CHECK:  [[VECTOR_BODY:vector\.body]]:
+; CHECK:    br i1 [[TMP9:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP3:\!.*]],
+; CHECK:  [[FOR_BODY:for\.body]]:
+; CHECK:    br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP6:\!.*]],
 entry:
   br i1 %cond, label %for.preheader, label %for.end, !prof !2
 
@@ -205,6 +207,15 @@ for.end:                                          ; preds = %for.body
   ret i32 0
 }
 
+; CHECK: [[LP3]] = !{!"branch_weights", i32 10, i32 2490}
+; CHECK: [[LP6]] = !{!"branch_weights", i32 10, i32 0}
+; The original loop has latchExitWeight=10 and backedgeTakenWeight=10,000,
+; therefore estimatedBackedgeTakenCount=1,000 and estimatedTripCount=1,001.
+; Vectorizing by 4 produces estimatedTripCounts of 1,001/4=250 and 1,001%4=1
+; for the vectorized and remainder loops, respectively, so their
+; estimatedBackedgeTakenCounts are 249 and 0. With a loop invocation weight of
+; 10, the recorded weights are the above {10, 2490} and {10, 0}.
+
 !0 = !{!"function_entry_count", i64 100}
 !1 = !{!"branch_weights", i32 100, i32 0}
 !2 = !{!"branch_weights", i32 10, i32 90}


        


More information about the llvm-commits mailing list