[llvm] 90be3c2 - [VPlan] Introduce new VPWidenCallRecipe (NFC).

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 6 08:08:01 PDT 2020


Author: Florian Hahn
Date: 2020-04-06T16:07:37+01:00
New Revision: 90be3c24a7162a488f68f7cce159017c10408133

URL: https://github.com/llvm/llvm-project/commit/90be3c24a7162a488f68f7cce159017c10408133
DIFF: https://github.com/llvm/llvm-project/commit/90be3c24a7162a488f68f7cce159017c10408133.diff

LOG: [VPlan] Introduce new VPWidenCallRecipe (NFC).

This patch moves calls to their own recipe, to simplify the transition
to VPUser for operands of VPWidenRecipe, as discussed in D76992.

Subsequently, additional information can be added to the recipe itself,
rather than being computed during the execute step.
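
For orientation only, not part of the commit: below is a minimal standalone
sketch (hypothetical names, no LLVM dependency) of the pattern the new recipe
follows. A subclass carries its ingredient, exposes an LLVM-RTTI-style
classof(), and defers the actual widening work to execute(); these are the
roles VPWidenCallRecipe and widenCallInstruction play in the diff further down.

  // Simplified model of the recipe pattern; none of these names exist in LLVM.
  #include <iostream>
  #include <memory>
  #include <string>
  #include <utility>
  #include <vector>

  struct Recipe {
    enum RecipeID { WidenSC, WidenCallSC };
    explicit Recipe(RecipeID ID) : ID(ID) {}
    virtual ~Recipe() = default;
    RecipeID getRecipeID() const { return ID; }
    virtual void execute() = 0;

  private:
    RecipeID ID;
  };

  // Plays the role of VPWidenCallRecipe: holds the call to be widened and
  // hands it to a widening helper when the plan is executed.
  struct WidenCallRecipe : Recipe {
    explicit WidenCallRecipe(std::string Call)
        : Recipe(WidenCallSC), Ingredient(std::move(Call)) {}

    // Type-inquiry hook, analogous to VPWidenCallRecipe::classof.
    static bool classof(const Recipe *R) {
      return R->getRecipeID() == WidenCallSC;
    }

    void execute() override {
      // Stand-in for State.ILV->widenCallInstruction(Ingredient).
      std::cout << "WIDEN-CALL " << Ingredient << "\n";
    }

  private:
    std::string Ingredient;
  };

  int main() {
    // Plan construction tries the call-specific recipe first, mirroring the
    // new tryToWidenCall step in tryToCreateRecipe.
    std::vector<std::unique_ptr<Recipe>> Plan;
    Plan.push_back(std::make_unique<WidenCallRecipe>("call @llvm.sqrt.f32"));
    for (const auto &R : Plan)
      R->execute();
    return 0;
  }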

Reviewers: rengolin, Ayal, gilr, hsaito

Reviewed By: gilr

Differential Revision: https://reviews.llvm.org/D77467

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
    llvm/lib/Transforms/Vectorize/VPlan.cpp
    llvm/lib/Transforms/Vectorize/VPlan.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 58358eb5d662..eecad0c3d3b5 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -409,6 +409,9 @@ class InnerLoopVectorizer {
   /// Widen a single instruction within the innermost loop.
   void widenInstruction(Instruction &I);
 
+  /// Widen a single call instruction within the innermost loop.
+  void widenCallInstruction(CallInst &I);
+
   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
   void fixVectorizedLoop();
 
@@ -4224,6 +4227,7 @@ static bool mayDivideByZero(Instruction &I) {
 
 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   switch (I.getOpcode()) {
+  case Instruction::Call:
   case Instruction::Br:
   case Instruction::PHI:
   case Instruction::GetElementPtr:
@@ -4348,54 +4352,62 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
     }
     break;
   }
+  default:
+    // This instruction is not vectorized by simple widening.
+    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+    llvm_unreachable("Unhandled instruction!");
+  } // end of switch.
+}
 
-  case Instruction::Call: {
-    // Ignore dbg intrinsics.
-    if (isa<DbgInfoIntrinsic>(I))
-      break;
-    setDebugLocFromInst(Builder, &I);
+void InnerLoopVectorizer::widenCallInstruction(CallInst &I) {
+  // Ignore dbg intrinsics.
+  // TODO: Debug intrinsics should be skipped/handled during VPlan construction
+  // rather than dropping them here.
+  if (isa<DbgInfoIntrinsic>(I))
+    return;
+  setDebugLocFromInst(Builder, &I);
 
-    Module *M = I.getParent()->getParent()->getParent();
-    auto *CI = cast<CallInst>(&I);
+  Module *M = I.getParent()->getParent()->getParent();
+  auto *CI = cast<CallInst>(&I);
 
-    SmallVector<Type *, 4> Tys;
-    for (Value *ArgOperand : CI->arg_operands())
-      Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+  SmallVector<Type *, 4> Tys;
+  for (Value *ArgOperand : CI->arg_operands())
+    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
 
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
-    // The flag shows whether we use Intrinsic or a usual Call for vectorized
-    // version of the instruction.
-    // Is it beneficial to perform intrinsic call compared to lib call?
-    bool NeedToScalarize = false;
-    unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
-    bool UseVectorIntrinsic =
-        ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
-    assert((UseVectorIntrinsic || !NeedToScalarize) &&
-           "Instruction should be scalarized elsewhere.");
+  // The flag shows whether we use Intrinsic or a usual Call for vectorized
+  // version of the instruction.
+  // Is it beneficial to perform intrinsic call compared to lib call?
+  bool NeedToScalarize = false;
+  unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
+  bool UseVectorIntrinsic =
+      ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
+  assert((UseVectorIntrinsic || !NeedToScalarize) &&
+         "Instruction should be scalarized elsewhere.");
 
-    for (unsigned Part = 0; Part < UF; ++Part) {
-      SmallVector<Value *, 4> Args;
-      for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
-        Value *Arg = CI->getArgOperand(i);
-        // Some intrinsics have a scalar argument - don't replace it with a
-        // vector.
-        if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
-          Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
-        Args.push_back(Arg);
-      }
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    SmallVector<Value *, 4> Args;
+    for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+      Value *Arg = CI->getArgOperand(i);
+      // Some intrinsics have a scalar argument - don't replace it with a
+      // vector.
+      if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
+        Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
+      Args.push_back(Arg);
+    }
 
-      Function *VectorF;
-      if (UseVectorIntrinsic) {
-        // Use vector version of the intrinsic.
-        Type *TysForDecl[] = {CI->getType()};
-        if (VF > 1)
-          TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
-        VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
-      } else {
-        // Use vector version of the function call.
-        const VFShape Shape =
-            VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
+    Function *VectorF;
+    if (UseVectorIntrinsic) {
+      // Use vector version of the intrinsic.
+      Type *TysForDecl[] = {CI->getType()};
+      if (VF > 1)
+        TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+      VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+    } else {
+      // Use vector version of the function call.
+      const VFShape Shape =
+          VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
 #ifndef NDEBUG
         const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
         assert(std::find_if(Infos.begin(), Infos.end(),
@@ -4405,7 +4417,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
                "Vector function shape is missing from the database.");
 #endif
         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
-      }
+    }
       assert(VectorF && "Can't create vector function.");
 
       SmallVector<OperandBundleDef, 1> OpBundles;
@@ -4417,16 +4429,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
 
       VectorLoopValueMap.setVectorValue(&I, Part, V);
       addMetadata(V, &I);
-    }
-
-    break;
   }
-
-  default:
-    // This instruction is not vectorized by simple widening.
-    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
-    llvm_unreachable("Unhandled instruction!");
-  } // end of switch.
 }
 
 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
@@ -6884,11 +6887,45 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
   return new VPBlendRecipe(Phi, Masks);
 }
 
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
+VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(Instruction *I,
+                                                   VFRange &Range) {
 
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
 
+  CallInst *CI = dyn_cast<CallInst>(I);
+  if (IsPredicated || !CI)
+    return nullptr;
+
+  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
+    return nullptr;
+
+  auto willWiden = [&](unsigned VF) -> bool {
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+    // The following case may be scalarized depending on the VF.
+    // The flag shows whether we use Intrinsic or a usual Call for vectorized
+    // version of the instruction.
+    // Is it beneficial to perform intrinsic call compared to lib call?
+    bool NeedToScalarize;
+    unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
+    bool UseVectorIntrinsic =
+        ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
+    return UseVectorIntrinsic || !NeedToScalarize;
+  };
+
+  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
+    return nullptr;
+
+  // Success: widen this call.
+  return new VPWidenCallRecipe(*CI);
+}
+
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
+  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
+
   if (IsPredicated)
     return nullptr;
 
@@ -6899,7 +6936,6 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
     case Instruction::AShr:
     case Instruction::BitCast:
     case Instruction::Br:
-    case Instruction::Call:
     case Instruction::FAdd:
     case Instruction::FCmp:
     case Instruction::FDiv:
@@ -6941,29 +6977,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
   if (!IsVectorizableOpcode(I->getOpcode()))
     return nullptr;
 
-  if (CallInst *CI = dyn_cast<CallInst>(I)) {
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
-               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
-      return nullptr;
-  }
-
   auto willWiden = [&](unsigned VF) -> bool {
     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                              CM.isProfitableToScalarize(I, VF)))
       return false;
-    if (CallInst *CI = dyn_cast<CallInst>(I)) {
-      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-      // The following case may be scalarized depending on the VF.
-      // The flag shows whether we use Intrinsic or a usual Call for vectorized
-      // version of the instruction.
-      // Is it beneficial to perform intrinsic call compared to lib call?
-      bool NeedToScalarize;
-      unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
-      bool UseVectorIntrinsic =
-          ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
-      return UseVectorIntrinsic || !NeedToScalarize;
-    }
     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
       assert(CM.getWideningDecision(I, VF) ==
                  LoopVectorizationCostModel::CM_Scalarize &&
@@ -7052,9 +7069,10 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
   VPRecipeBase *Recipe = nullptr;
 
-  // First, check for specific widening recipes that deal with memory
+  // First, check for specific widening recipes that deal with calls, memory
   // operations, inductions and Phi nodes.
-  if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
+  if ((Recipe = tryToWidenCall(Instr, Range)) ||
+      (Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
       (Recipe = tryToBlend(Instr, Plan)) ||
       (isa<PHINode>(Instr) &&
@@ -7371,6 +7389,10 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
 }
 
+void VPWidenCallRecipe::execute(VPTransformState &State) {
+  State.ILV->widenCallInstruction(Ingredient);
+}
+
 void VPWidenRecipe::execute(VPTransformState &State) {
   State.ILV->widenInstruction(Ingredient);
 }

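A side note, not part of the commit: both tryToWidenCall and tryToWiden rely
on LoopVectorizationPlanner::getDecisionAndClampRange, which evaluates a
predicate at Range.Start and shrinks Range.End so that every VF left in the
range yields the same decision. A simplified standalone model of that contract
(hypothetical code, power-of-two VFs assumed):

  #include <functional>
  #include <iostream>

  struct VFRange { unsigned Start; unsigned End; };  // VFs in [Start, End)

  // Returns the decision at Range.Start and clamps Range.End to the first
  // VF whose decision differs, so the whole remaining range agrees.
  static bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Pred,
                                       VFRange &Range) {
    bool Decision = Pred(Range.Start);
    for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
      if (Pred(VF) != Decision) {
        Range.End = VF;
        break;
      }
    return Decision;
  }

  int main() {
    VFRange Range{2, 16};
    // Pretend widening the call is only profitable up to VF = 4.
    bool Widen = getDecisionAndClampRange(
        [](unsigned VF) { return VF <= 4; }, Range);
    std::cout << "widen=" << Widen << " range=[" << Range.Start << ","
              << Range.End << ")\n";  // prints: widen=1 range=[2,8)
    return 0;
  }
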
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index ab9cd774f428..d5435c23e65c 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -107,6 +107,11 @@ class VPRecipeBuilder {
   /// full if-conversion.
   VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
 
+  /// Handle call instruction. If \p I is a call that can be widened for \p
+  /// Range.Start, return a new VPWidenCallRecipe. Range.End may be decreased to
+  /// ensure same decision from \p Range.Start to \p Range.End.
+  VPWidenCallRecipe *tryToWidenCall(Instruction *I, VFRange &Range);
+
   /// Check if \p I can be widened within the given VF \p Range. If \p I can be
   /// widened for \p Range.Start, build a new VPWidenRecipe and return it.
   /// Range.End may be decreased to ensure same decision from \p Range.Start to

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 7944cf540538..17b4a3da6a06 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -712,6 +712,12 @@ void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) {
   O << DOT::EscapeString(IngredientString);
 }
 
+void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
+                              VPSlotTracker &SlotTracker) const {
+  O << " +\n"
+    << Indent << "\"WIDEN-CALL " << VPlanIngredient(&Ingredient) << "\\l\"";
+}
+
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
   O << " +\n" << Indent << "\"WIDEN\\l\"";

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 68928d02d56d..7e1f74735918 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -609,6 +609,7 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> {
     VPInterleaveSC,
     VPPredInstPHISC,
     VPReplicateSC,
+    VPWidenCallSC,
     VPWidenGEPSC,
     VPWidenIntOrFpInductionSC,
     VPWidenMemoryInstructionSC,
@@ -783,6 +784,30 @@ class VPWidenRecipe : public VPRecipeBase {
              VPSlotTracker &SlotTracker) const override;
 };
 
+/// A recipe for widening Call instructions.
+class VPWidenCallRecipe : public VPRecipeBase {
+private:
+  /// Hold the call to be widened.
+  CallInst &Ingredient;
+
+public:
+  VPWidenCallRecipe(CallInst &I) : VPRecipeBase(VPWidenCallSC), Ingredient(I) {}
+
+  ~VPWidenCallRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPWidenCallSC;
+  }
+
+  /// Produce a widened version of the call instruction.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+};
+
 /// A recipe for handling GEP instructions.
 class VPWidenGEPRecipe : public VPRecipeBase {
 private:

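A usage note, not part of the diff: the classof() above is what lets LLVM's
casting utilities recognize the new recipe when walking a plan. A hypothetical
traversal over a VPBasicBlock *VPBB might look like:

  for (VPRecipeBase &R : *VPBB)
    if (auto *WidenCall = dyn_cast<VPWidenCallRecipe>(&R))
      ; // dyn_cast succeeds because classof() checks VPWidenCallSC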
