[llvm] d7e16ca - [LV] Interleave to expose ILP for small loops with scalar reductions.

Aaron Liu via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 1 12:49:26 PDT 2020


Author: Aaron Liu
Date: 2020-09-01T19:47:32Z
New Revision: d7e16ca28f48000d4fb3e3388d782cbd9ad02e62

URL: https://github.com/llvm/llvm-project/commit/d7e16ca28f48000d4fb3e3388d782cbd9ad02e62
DIFF: https://github.com/llvm/llvm-project/commit/d7e16ca28f48000d4fb3e3388d782cbd9ad02e62.diff

LOG: [LV] Interleave to expose ILP for small loops with scalar reductions.

Interleave small loops that have reductions inside; this breaks
cross-iteration dependencies and exposes ILP.

This gives very significant performance improvements for some benchmarks,
because small loops can be in very hot functions in real applications.

Differential Revision: https://reviews.llvm.org/D81416

Added: 
    llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6aa520f6e8ec..b9f7ae71d0cf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -270,6 +270,12 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
     cl::desc(
         "Enable runtime interleaving until load/store ports are saturated"));
 
+/// Interleave small loops with scalar reductions.
+static cl::opt<bool> InterleaveSmallLoopScalarReduction(
+    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
+    cl::desc("Enable interleaving for loops with small iteration counts that "
+             "contain scalar reductions to expose ILP."));
+
 /// The number of stores in a loop that are allowed to need predication.
 static cl::opt<unsigned> NumberOfStoresToPredicate(
     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -5519,10 +5525,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   if (Legal->getMaxSafeDepDistBytes() != -1U)
     return 1;
 
-  // Do not interleave loops with a relatively small known or estimated trip
-  // count.
   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
-  if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
+  const bool HasReductions = !Legal->getReductionVars().empty();
+  // Do not interleave loops with a relatively small known or estimated trip
+  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
+  // enabled and the code has scalar reductions (HasReductions && VF == 1),
+  // because under those conditions interleaving can expose ILP and break
+  // cross-iteration dependences for reductions.
+  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
+      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
     return 1;
 
   RegisterUsage R = calculateRegisterUsage({VF})[0];
@@ -5550,7 +5561,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                       << " registers of "
                       << TTI.getRegisterClassName(pair.first) << " register class\n");
-    if (VF == 1) {
+    if (VF.isScalar()) {
       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
         TargetNumRegisters = ForceTargetNumScalarRegs;
     } else {
@@ -5579,7 +5590,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
 
   // Check if the user has overridden the max.
-  if (VF == 1) {
+  if (VF.isScalar()) {
     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
   } else {
@@ -5610,7 +5621,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
 
   // Interleave if we vectorized this loop and there is a reduction that could
   // benefit from interleaving.
-  if (VF.isVector() && !Legal->getReductionVars().empty()) {
+  if (VF.isVector() && HasReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
     return IC;
   }
@@ -5622,7 +5633,11 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
 
   // We want to interleave small loops in order to reduce the loop overhead and
   // potentially expose ILP opportunities.
-  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
+                    << "LV: IC is " << IC << '\n'
+                    << "LV: VF is " << VF.getKnownMinValue() << '\n');
+  const bool AggressivelyInterleaveReductions =
+      TTI.enableAggressiveInterleaving(HasReductions);
   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
     // We assume that the cost overhead is 1 and we use the cost model
     // to estimate the cost of the loop and interleave until the cost of the
@@ -5641,7 +5656,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     // by this point), we can increase the critical path length if the loop
     // we're interleaving is inside another loop. Limit, by default to 2, so the
     // critical path only gets increased by one reduction operation.
-    if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
+    if (HasReductions && TheLoop->getLoopDepth() > 1) {
       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
       SmallIC = std::min(SmallIC, F);
       StoresIC = std::min(StoresIC, F);
@@ -5655,14 +5670,23 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
       return std::max(StoresIC, LoadsIC);
     }
 
-    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
-    return SmallIC;
+    // If there are scalar reductions and TTI has enabled aggressive
+    // interleaving for reductions, we will interleave to expose ILP.
+    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
+        AggressivelyInterleaveReductions) {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+      // Interleave no less than SmallIC, but not as aggressively as the
+      // normal IC, to handle the rare case where resources are too limited.
+      return std::max(IC / 2, SmallIC);
+    } else {
+      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
+      return SmallIC;
+    }
   }
 
   // Interleave if this is a large loop (small loops are already dealt with by
   // this point) that could benefit from interleaving.
-  bool HasReductions = !Legal->getReductionVars().empty();
-  if (TTI.enableAggressiveInterleaving(HasReductions)) {
+  if (AggressivelyInterleaveReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
     return IC;
   }

diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
new file mode 100644
index 000000000000..a664975184c5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -loop-vectorize -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+
+; CHECK-LABEL: vector.body
+; CHECK: load double, double*
+; CHECK-NEXT: load double, double*
+; CHECK-NEXT: load double, double*
+; CHECK-NEXT: load double, double*
+
+; CHECK: fmul fast double
+; CHECK-NEXT: fmul fast double
+; CHECK-NEXT: fmul fast double
+; CHECK-NEXT: fmul fast double
+
+; CHECK: fadd fast double
+; CHECK-NEXT: fadd fast double
+; CHECK-NEXT: fadd fast double
+; CHECK-NEXT: fadd fast double
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+define dso_local void @test(i32*** %arg, double** %arg1) align 2 {
+bb:
+  %tpm15 = load i32**, i32*** %arg, align 8
+  %tpm19 = load double*, double** %arg1, align 8
+  br label %bb22
+bb22:                                             ; preds = %bb33, %bb
+  %tpm26 = add i64 0, 1
+  %tpm10 = alloca i32, align 8
+  %tpm27 = getelementptr inbounds i32, i32* %tpm10, i64 %tpm26
+  %tpm28 = getelementptr inbounds i32*, i32** %tpm15, i64 0
+  %tpm29 = load i32*, i32** %tpm28, align 8
+  %tpm17 = alloca double, align 8
+  %tpm32 = getelementptr inbounds double, double* %tpm17, i64 %tpm26
+  br label %bb40
+bb33:                                             ; preds = %bb40
+  %tpm35 = getelementptr inbounds double, double* %tpm19, i64 0
+  %tpm37 = fsub fast double 0.000000e+00, %tpm50
+  store double %tpm37, double* %tpm35, align 8
+  br label %bb22
+bb40:                                             ; preds = %bb40, %bb22
+  %tpm41 = phi i32* [ %tpm51, %bb40 ], [ %tpm27, %bb22 ]
+  %tpm42 = phi double* [ %tpm52, %bb40 ], [ %tpm32, %bb22 ]
+  %tpm43 = phi double [ %tpm50, %bb40 ], [ 0.000000e+00, %bb22 ]
+  %tpm44 = load double, double* %tpm42, align 8
+  %tpm45 = load i32, i32* %tpm41, align 4
+  %tpm46 = zext i32 %tpm45 to i64
+  %tpm47 = getelementptr inbounds double, double* %tpm19, i64 %tpm46
+  %tpm48 = load double, double* %tpm47, align 8
+  %tpm49 = fmul fast double %tpm48, %tpm44
+  %tpm50 = fadd fast double %tpm49, %tpm43
+  %tpm51 = getelementptr inbounds i32, i32* %tpm41, i64 1
+  %tpm52 = getelementptr inbounds double, double* %tpm42, i64 1
+  %tpm53 = icmp eq i32* %tpm51, %tpm29
+  br i1 %tpm53, label %bb33, label %bb40
+}


        

