[llvm] r366030 - [LV] Exclude loop-invariant inputs from scalar cost computation.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 14 13:12:37 PDT 2019


Author: fhahn
Date: Sun Jul 14 13:12:36 2019
New Revision: 366030

URL: http://llvm.org/viewvc/llvm-project?rev=366030&view=rev
Log:
[LV] Exclude loop-invariant inputs from scalar cost computation.

Loop-invariant operands do not need to be scalarized: their values are
computed outside the loop and can be used directly by the scalarized
instructions, so we should ignore them when computing the scalarization
overhead.

Fixes PR41294
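
For illustration only, a minimal standalone sketch of the filtering idea
follows (the free-function form, the names operandNeedsExtract/filterOperands
and the includes are assumptions made for this sketch; the committed
implementation is the needsExtract/filterExtractingOperands pair in the diff
below, which additionally consults the Scalars map via
isScalarAfterVectorization):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/IR/Instruction.h"

  using namespace llvm;

  // An operand only incurs extraction overhead if it is an instruction
  // defined inside the loop and not loop-invariant; constants, arguments and
  // values defined outside the loop are already available as scalars, so
  // they are free.
  static bool operandNeedsExtract(Value *V, const Loop *L, unsigned VF) {
    auto *I = dyn_cast<Instruction>(V);
    return VF > 1 && I && L->contains(I) && !L->isLoopInvariant(I);
  }

  // Drop the free operands before asking TTI for the operand scalarization
  // overhead.
  static SmallVector<Value *, 4>
  filterOperands(Instruction::op_range Ops, const Loop *L, unsigned VF) {
    return SmallVector<Value *, 4>(make_filter_range(
        Ops, [&](Value *V) { return operandNeedsExtract(V, L, VF); }));
  }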

Reviewers: hsaito, rengolin, dcaballe, Ayal

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D59995

Added:
    llvm/trunk/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=366030&r1=366029&r2=366030&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Sun Jul 14 13:12:36 2019
@@ -1179,7 +1179,7 @@ public:
   /// VF. Return the cost of the instruction, including scalarization overhead
   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
   /// scalarized -
-  // i.e. either vector version isn't available, or is too expensive.
+  /// i.e. either vector version isn't available, or is too expensive.
   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
 
 private:
@@ -1332,6 +1332,30 @@ private:
 
   DecisionList WideningDecisions;
 
+  /// Returns true if \p V is expected to be vectorized and it needs to be
+  /// extracted.
+  bool needsExtract(Value *V, unsigned VF) const {
+    Instruction *I = dyn_cast<Instruction>(V);
+    if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
+      return false;
+
+    // Assume we can vectorize V (and hence we need extraction) if the
+    // scalars are not computed yet. This can happen, because it is called
+    // via getScalarizationOverhead from setCostBasedWideningDecision, before
+    // the scalars are collected. That should be a safe assumption in most
+    // cases, because we check if the operands have vectorizable types
+    // beforehand in LoopVectorizationLegality.
+    return Scalars.find(VF) == Scalars.end() ||
+           !isScalarAfterVectorization(I, VF);
+  };
+
+  /// Returns a range containing only operands needing to be extracted.
+  SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
+                                                   unsigned VF) {
+    return SmallVector<Value *, 4>(make_filter_range(
+        Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+  }
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -3125,8 +3149,11 @@ unsigned LoopVectorizationCostModel::get
   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
     FMF = FPMO->getFastMathFlags();
 
-  SmallVector<Value *, 4> Operands(CI->arg_operands());
-  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
+  // Skip operands that do not require extraction/scalarization and do not incur
+  // any overhead.
+  return TTI.getIntrinsicInstrCost(
+      ID, CI->getType(), filterExtractingOperands(CI->arg_operands(), VF), FMF,
+      VF);
 }
 
 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
@@ -5346,15 +5373,6 @@ int LoopVectorizationCostModel::computeP
     return true;
   };
 
-  // Returns true if an operand that cannot be scalarized must be extracted
-  // from a vector. We will account for this scalarization overhead below. Note
-  // that the non-void predicated instructions are placed in their own blocks,
-  // and their return values are inserted into vectors. Thus, an extract would
-  // still be required.
-  auto needsExtract = [&](Instruction *I) -> bool {
-    return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
-  };
-
   // Compute the expected cost discount from scalarizing the entire expression
   // feeding the predicated instruction. We currently only consider expressions
   // that are single-use instruction chains.
@@ -5394,7 +5412,7 @@ int LoopVectorizationCostModel::computeP
                "Instruction has non-scalar type");
         if (canBeScalarized(J))
           Worklist.push_back(J);
-        else if (needsExtract(J))
+        else if (needsExtract(J, VF))
           ScalarCost += TTI.getScalarizationOverhead(
                               ToVectorTy(J->getType(),VF), false, true);
       }
@@ -5684,16 +5702,18 @@ unsigned LoopVectorizationCostModel::get
   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
     return Cost;
 
-  if (CallInst *CI = dyn_cast<CallInst>(I)) {
-    SmallVector<const Value *, 4> Operands(CI->arg_operands());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  } else if (!isa<StoreInst>(I) ||
-             !TTI.supportsEfficientVectorElementLoadStore()) {
-    SmallVector<const Value *, 4> Operands(I->operand_values());
-    Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  }
+  // Some targets support efficient element stores.
+  if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
+    return Cost;
 
-  return Cost;
+  // Collect operands to consider.
+  CallInst *CI = dyn_cast<CallInst>(I);
+  Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
+
+  // Skip operands that do not require extraction/scalarization and do not incur
+  // any overhead.
+  return Cost + TTI.getOperandsScalarizationOverhead(
+                    filterExtractingOperands(Ops, VF), VF);
 }
 
 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {

Added: llvm/trunk/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll?rev=366030&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll Sun Jul 14 13:12:36 2019
@@ -0,0 +1,109 @@
+; REQUIRES: asserts
+
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios %s -S -debug -disable-output 2>&1 | FileCheck --check-prefix=CM %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 %s -S | FileCheck --check-prefix=FORCED %s
+
+; Test case from PR41294.
+
+; Check scalar cost for extractvalue. The constant and loop invariant operands are free,
+; leaving cost 3 for scalarizing the result + 2 for executing the op with VF 2.
+
+; CM: LV: Scalar loop costs: 7.
+; CM: LV: Found an estimated cost of 5 for VF 2 For instruction:   %a = extractvalue { i64, i64 } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction:   %b = extractvalue { i64, i64 } %sv, 1
+
+; Check that the extractvalue operands are actually free in vector code.
+
+; FORCED-LABEL: vector.body:                                      ; preds = %vector.body, %vector.ph
+; FORCED-NEXT:    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; FORCED-NEXT:    %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
+; FORCED-NEXT:    %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
+; FORCED-NEXT:    %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
+; FORCED-NEXT:    %0 = add i32 %index, 0
+; FORCED-NEXT:    %1 = extractvalue { i64, i64 } %sv, 0
+; FORCED-NEXT:    %2 = extractvalue { i64, i64 } %sv, 0
+; FORCED-NEXT:    %3 = insertelement <2 x i64> undef, i64 %1, i32 0
+; FORCED-NEXT:    %4 = insertelement <2 x i64> %3, i64 %2, i32 1
+; FORCED-NEXT:    %5 = extractvalue { i64, i64 } %sv, 1
+; FORCED-NEXT:    %6 = extractvalue { i64, i64 } %sv, 1
+; FORCED-NEXT:    %7 = insertelement <2 x i64> undef, i64 %5, i32 0
+; FORCED-NEXT:    %8 = insertelement <2 x i64> %7, i64 %6, i32 1
+; FORCED-NEXT:    %9 = getelementptr i64, i64* %dst, i32 %0
+; FORCED-NEXT:    %10 = add <2 x i64> %4, %8
+; FORCED-NEXT:    %11 = getelementptr i64, i64* %9, i32 0
+; FORCED-NEXT:    %12 = bitcast i64* %11 to <2 x i64>*
+; FORCED-NEXT:    store <2 x i64> %10, <2 x i64>* %12, align 4
+; FORCED-NEXT:    %index.next = add i32 %index, 2
+; FORCED-NEXT:    %13 = icmp eq i32 %index.next, 0
+; FORCED-NEXT:    br i1 %13, label %middle.block, label %vector.body, !llvm.loop !0
+
+define void @test1(i64* %dst, {i64, i64} %sv) {
+entry:
+  br label %loop.body
+
+loop.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
+  %a = extractvalue { i64, i64 } %sv, 0
+  %b = extractvalue { i64, i64 } %sv, 1
+  %addr = getelementptr i64, i64* %dst, i32 %iv
+  %add = add i64 %a, %b
+  store i64 %add, i64* %addr
+  %iv.next = add nsw i32 %iv, 1
+  %cond = icmp ne i32 %iv.next, 0
+  br i1 %cond, label %loop.body, label %exit
+
+exit:
+  ret void
+}
+
+
+; Similar to the test case above, but checks getVectorCallCost as well.
+declare float @pow(float, float) readnone nounwind
+
+; CM: LV: Scalar loop costs: 16.
+; CM: LV: Found an estimated cost of 5 for VF 2 For instruction:   %a = extractvalue { float, float } %sv, 0
+; CM-NEXT: LV: Found an estimated cost of 5 for VF 2 For instruction:   %b = extractvalue { float, float } %sv, 1
+
+; FORCED-LABEL: define void @test_getVectorCallCost
+
+; FORCED-LABEL: vector.body:                                      ; preds = %vector.body, %vector.ph
+; FORCED-NEXT:    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; FORCED-NEXT:    %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
+; FORCED-NEXT:    %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
+; FORCED-NEXT:    %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
+; FORCED-NEXT:    %0 = add i32 %index, 0
+; FORCED-NEXT:    %1 = extractvalue { float, float } %sv, 0
+; FORCED-NEXT:    %2 = extractvalue { float, float } %sv, 0
+; FORCED-NEXT:    %3 = insertelement <2 x float> undef, float %1, i32 0
+; FORCED-NEXT:    %4 = insertelement <2 x float> %3, float %2, i32 1
+; FORCED-NEXT:    %5 = extractvalue { float, float } %sv, 1
+; FORCED-NEXT:    %6 = extractvalue { float, float } %sv, 1
+; FORCED-NEXT:    %7 = insertelement <2 x float> undef, float %5, i32 0
+; FORCED-NEXT:    %8 = insertelement <2 x float> %7, float %6, i32 1
+; FORCED-NEXT:    %9 = getelementptr float, float* %dst, i32 %0
+; FORCED-NEXT:    %10 = call <2 x float> @llvm.pow.v2f32(<2 x float> %4, <2 x float> %8)
+; FORCED-NEXT:    %11 = getelementptr float, float* %9, i32 0
+; FORCED-NEXT:    %12 = bitcast float* %11 to <2 x float>*
+; FORCED-NEXT:    store <2 x float> %10, <2 x float>* %12, align 4
+; FORCED-NEXT:    %index.next = add i32 %index, 2
+; FORCED-NEXT:    %13 = icmp eq i32 %index.next, 0
+; FORCED-NEXT:    br i1 %13, label %middle.block, label %vector.body, !llvm.loop !4
+
+define void @test_getVectorCallCost(float* %dst, {float, float} %sv) {
+entry:
+  br label %loop.body
+
+loop.body:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.body ]
+  %a = extractvalue { float, float } %sv, 0
+  %b = extractvalue { float, float } %sv, 1
+  %addr = getelementptr float, float* %dst, i32 %iv
+  %p = call float @pow(float %a, float %b)
+  store float %p, float* %addr
+  %iv.next = add nsw i32 %iv, 1
+  %cond = icmp ne i32 %iv.next, 0
+  br i1 %cond, label %loop.body, label %exit
+
+exit:
+  ret void
+}



