[llvm] 961f51f - [LoopVectorize][CostModel] Choose smaller VFs for in-loop reductions without loads/stores

Tue Jan 4 02:26:07 PST 2022

Author: Rosie Sumpter
Date: 2022-01-04T10:12:57Z
New Revision: 961f51fdf04fd14f5dc5e7a6d53a5460249d947c

URL: https://github.com/llvm/llvm-project/commit/961f51fdf04fd14f5dc5e7a6d53a5460249d947c
DIFF: https://github.com/llvm/llvm-project/commit/961f51fdf04fd14f5dc5e7a6d53a5460249d947c.diff

LOG: [LoopVectorize][CostModel] Choose smaller VFs for in-loop reductions without loads/stores

For loops that contain in-loop reductions but no loads or stores, large
VFs are chosen because LoopVectorizationCostModel::getSmallestAndWidestTypes
has no element types to check through and so returns the default widths
(-1U for the smallest and 8 for the widest). This results in the widest
VF being chosen for the following example,

float s = 0;
for (int i = 0; i < N; ++i)
  s += (float) i*i;

which, for more computationally intensive loops, leads to large loop
sizes when the operations end up being scalarized.

In this patch, for the case where ElementTypesInLoop is empty, the widest
type is determined by finding the smallest type used by recurrences in
the loop instead of falling back to a default value of 8 bits. This
results in the cost model choosing a more sensible VF for loops like
the one above.

Differential Revision: https://reviews.llvm.org/D113973

Added: 
    

Modified: 
    llvm/include/llvm/Analysis/IVDescriptors.h
    llvm/lib/Analysis/IVDescriptors.cpp
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
    llvm/test/Transforms/LoopVectorize/X86/funclet.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 9858a46d16a26..dec488a6f26db 100644

--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -77,10 +77,12 @@ class RecurrenceDescriptor {
   RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K,
                        FastMathFlags FMF, Instruction *ExactFP, Type *RT,
                        bool Signed, bool Ordered,
-                       SmallPtrSetImpl<Instruction *> &CI)
+                       SmallPtrSetImpl<Instruction *> &CI,
+                       unsigned MinWidthCastToRecurTy)
       : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF),
         ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed),
-        IsOrdered(Ordered) {
+        IsOrdered(Ordered),
+        MinWidthCastToRecurrenceType(MinWidthCastToRecurTy) {
     CastInsts.insert(CI.begin(), CI.end());
   }
 
@@ -251,6 +253,11 @@ class RecurrenceDescriptor {
   /// recurrence.
   const SmallPtrSet<Instruction *, 8> &getCastInsts() const { return CastInsts; }
 
+  /// Returns the minimum width used by the recurrence in bits.
+  unsigned getMinWidthCastToRecurrenceTypeInBits() const {
+    return MinWidthCastToRecurrenceType;
+  }
+
   /// Returns true if all source operands of the recurrence are SExtInsts.
   bool isSigned() const { return IsSigned; }
 
@@ -291,6 +298,8 @@ class RecurrenceDescriptor {
   bool IsOrdered = false;
   // Instructions used for type-promoting the recurrence.
   SmallPtrSet<Instruction *, 8> CastInsts;
+  // The minimum width used by the recurrence.
+  unsigned MinWidthCastToRecurrenceType;
 };
 
 /// A struct for saving information about induction variables.

diff  --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index f5fa6748d053c..9551eb48e2316 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -161,19 +161,22 @@ static std::pair<Type *, bool> computeRecurrenceType(Instruction *Exit,
 
 /// Collect cast instructions that can be ignored in the vectorizer's cost
 /// model, given a reduction exit value and the minimal type in which the
-/// reduction can be represented.
-static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
-                                 Type *RecurrenceType,
-                                 SmallPtrSetImpl<Instruction *> &Casts) {
+// reduction can be represented. Also search casts to the recurrence type
+// to find the minimum width used by the recurrence.
+static void collectCastInstrs(Loop *TheLoop, Instruction *Exit,
+                              Type *RecurrenceType,
+                              SmallPtrSetImpl<Instruction *> &Casts,
+                              unsigned &MinWidthCastToRecurTy) {
 
   SmallVector<Instruction *, 8> Worklist;
   SmallPtrSet<Instruction *, 8> Visited;
   Worklist.push_back(Exit);
+  MinWidthCastToRecurTy = -1U;
 
   while (!Worklist.empty()) {
     Instruction *Val = Worklist.pop_back_val();
     Visited.insert(Val);
-    if (auto *Cast = dyn_cast<CastInst>(Val))
+    if (auto *Cast = dyn_cast<CastInst>(Val)) {
       if (Cast->getSrcTy() == RecurrenceType) {
         // If the source type of a cast instruction is equal to the recurrence
         // type, it will be eliminated, and should be ignored in the vectorizer
@@ -181,7 +184,16 @@ static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
         Casts.insert(Cast);
         continue;
       }
-
+      if (Cast->getDestTy() == RecurrenceType) {
+        // The minimum width used by the recurrence is found by checking for
+        // casts on its operands. The minimum width is used by the vectorizer
+        // when finding the widest type for in-loop reductions without any
+        // loads/stores.
+        MinWidthCastToRecurTy = std::min<unsigned>(
+            MinWidthCastToRecurTy, Cast->getSrcTy()->getScalarSizeInBits());
+        continue;
+      }
+    }
     // Add all operands to the work list if they are loop-varying values that
     // we haven't yet visited.
     for (Value *O : cast<User>(Val)->operands())
@@ -265,6 +277,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   // Data used for determining if the recurrence has been type-promoted.
   Type *RecurrenceType = Phi->getType();
   SmallPtrSet<Instruction *, 4> CastInsts;
+  unsigned MinWidthCastToRecurrenceType;
   Instruction *Start = Phi;
   bool IsSigned = false;
 
@@ -500,21 +513,24 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
         computeRecurrenceType(ExitInstruction, DB, AC, DT);
     if (ComputedType != RecurrenceType)
       return false;
-
-    // The recurrence expression will be represented in a narrower type. If
-    // there are any cast instructions that will be unnecessary, collect them
-    // in CastInsts. Note that the 'and' instruction was already included in
-    // this list.
-    //
-    // TODO: A better way to represent this may be to tag in some way all the
-    //       instructions that are a part of the reduction. The vectorizer cost
-    //       model could then apply the recurrence type to these instructions,
-    //       without needing a white list of instructions to ignore.
-    //       This may also be useful for the inloop reductions, if it can be
-    //       kept simple enough.
-    collectCastsToIgnore(TheLoop, ExitInstruction, RecurrenceType, CastInsts);
   }
 
+  // Collect cast instructions and the minimum width used by the recurrence.
+  // If the starting value is not the same as the phi node and the computed
+  // recurrence type is equal to the recurrence type, the recurrence expression
+  // will be represented in a narrower or wider type. If there are any cast
+  // instructions that will be unnecessary, collect them in CastsFromRecurTy.
+  // Note that the 'and' instruction was already included in this list.
+  //
+  // TODO: A better way to represent this may be to tag in some way all the
+  //       instructions that are a part of the reduction. The vectorizer cost
+  //       model could then apply the recurrence type to these instructions,
+  //       without needing a white list of instructions to ignore.
+  //       This may also be useful for the inloop reductions, if it can be
+  //       kept simple enough.
+  collectCastInstrs(TheLoop, ExitInstruction, RecurrenceType, CastInsts,
+                    MinWidthCastToRecurrenceType);
+
   // We found a reduction var if we have reached the original phi node and we
   // only have a single instruction with out-of-loop users.
 
@@ -524,7 +540,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
   // Save the description of this reduction variable.
   RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
                           ReduxDesc.getExactFPMathInst(), RecurrenceType,
-                          IsSigned, IsOrdered, CastInsts);
+                          IsSigned, IsOrdered, CastInsts,
+                          MinWidthCastToRecurrenceType);
   RedDes = RD;
 
   return true;

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a277ee37d12cc..a62bd4884fd66 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5960,11 +5960,29 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
   unsigned MinWidth = -1U;
   unsigned MaxWidth = 8;
   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
-  for (Type *T : ElementTypesInLoop) {
-    MinWidth = std::min<unsigned>(
-        MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
-    MaxWidth = std::max<unsigned>(
-        MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+  // For in-loop reductions, no element types are added to ElementTypesInLoop
+  // if there are no loads/stores in the loop. In this case, check through the
+  // reduction variables to determine the maximum width.
+  if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
+    // Reset MaxWidth so that we can find the smallest type used by recurrences
+    // in the loop.
+    MaxWidth = -1U;
+    for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
+      const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
+      // When finding the min width used by the recurrence we need to account
+      // for casts on the input operands of the recurrence.
+      MaxWidth = std::min<unsigned>(
+          MaxWidth, std::min<unsigned>(
+                        RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
+                        RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
+    }
+  } else {
+    for (Type *T : ElementTypesInLoop) {
+      MinWidth = std::min<unsigned>(
+          MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+      MaxWidth = std::max<unsigned>(
+          MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+    }
   }
   return {MinWidth, MaxWidth};
 }

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
index 1ae7dadeffd7f..fec056ad7c122 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-target-instruction-cost=1 -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -31,3 +31,74 @@ for.body:
 for.end:
   ret void
 }
+
+; For in-loop reductions with no loads or stores in the loop the widest type is
+; determined by looking through the recurrences, which allows a sensible VF to be
+; chosen. The following 3 cases check different combinations of widths.
+
+; CHECK-LABEL: Checking a loop in "no_loads_stores_32"
+; CHECK: The Smallest and Widest types: 4294967295 / 32 bits
+; CHECK: Selecting VF: 4
+
+define double @no_loads_stores_32(i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %s.09 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = sitofp i32 %i.08 to float
+  %conv1 = fpext float %conv to double
+  %add = fadd double %s.09, %conv1
+  %inc = add nuw i32 %i.08, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  %.lcssa = phi double [ %add, %for.body ]
+  ret double %.lcssa
+}
+
+; CHECK-LABEL: Checking a loop in "no_loads_stores_16"
+; CHECK: The Smallest and Widest types: 4294967295 / 16 bits
+; CHECK: Selecting VF: 8
+
+define double @no_loads_stores_16() {
+entry:
+  br label %for.body
+
+for.body:
+  %s.09 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %i.08 = phi i16 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = sitofp i16 %i.08 to double
+  %add = fadd double %s.09, %conv
+  %inc = add nuw nsw i16 %i.08, 1
+  %exitcond.not = icmp eq i16 %inc, 12345
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  %.lcssa = phi double [ %add, %for.body ]
+  ret double %.lcssa
+}
+
+; CHECK-LABEL: Checking a loop in "no_loads_stores_8"
+; CHECK: The Smallest and Widest types: 4294967295 / 8 bits
+; CHECK: Selecting VF: 16
+
+define float @no_loads_stores_8() {
+entry:
+  br label %for.body
+
+for.body:
+  %s.09 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %i.08 = phi i8 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = sitofp i8 %i.08 to float
+  %add = fadd float %s.09, %conv
+  %inc = add nuw nsw i8 %i.08, 1
+  %exitcond.not = icmp eq i8 %inc, 12345
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  %.lcssa = phi float [ %add, %for.body ]
+  ret float %.lcssa
+}

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/funclet.ll b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll
index 88f15e7e14854..87df85a69195d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/funclet.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/funclet.ll
@@ -33,7 +33,7 @@ unreachable:                                      ; preds = %entry
 
 ; CHECK-LABEL: define void @test1(
 ; CHECK: %[[cpad:.*]] = catchpad within {{.*}} [i8* null, i32 64, i8* null]
-; CHECK: call <16 x double> @llvm.floor.v16f64(<16 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
+; CHECK: call <8 x double> @llvm.floor.v8f64(<8 x double> {{.*}}) [ "funclet"(token %[[cpad]]) ]
 
 declare x86_stdcallcc void @_CxxThrowException(i8*, i8*)