[llvm] 6a6fc48 - [SLP][NFC] Formatting of the getEntryCost function, NFC.

Alexey Bataev via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 18 07:19:10 PDT 2022


Author: Alexey Bataev
Date: 2022-10-18T07:18:26-07:00
New Revision: 6a6fc4890d9fa0bc5fef738c5ad73fb958606f54

URL: https://github.com/llvm/llvm-project/commit/6a6fc4890d9fa0bc5fef738c5ad73fb958606f54
DIFF: https://github.com/llvm/llvm-project/commit/6a6fc4890d9fa0bc5fef738c5ad73fb958606f54.diff

LOG: [SLP][NFC] Formatting of the getEntryCost function, NFC.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 429dec7c9c3d1..56710eee0de24 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6013,7 +6013,7 @@ TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> VL,
 
 InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                                       ArrayRef<Value *> VectorizedVals) {
-  ArrayRef<Value*> VL = E->Scalars;
+  ArrayRef<Value *> VL = E->Scalars;
 
   Type *ScalarTy = VL[0]->getType();
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -6034,10 +6034,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
   auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF);
 
   bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
-  // FIXME: it tries to fix a problem with MSVC buildbots.
-  TargetTransformInfo &TTIRef = *TTI;
-  auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
-                               VectorizedVals, E](InstructionCost &Cost) {
+  auto AdjustExtractsCost = [=](InstructionCost &Cost) {
     DenseMap<Value *, int> ExtractVectorsTys;
     SmallPtrSet<Value *, 4> CheckedExtracts;
     for (auto *V : VL) {
@@ -6059,8 +6056,8 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
       if (!EEIdx)
         continue;
       unsigned Idx = *EEIdx;
-      if (TTIRef.getNumberOfParts(VecTy) !=
-          TTIRef.getNumberOfParts(EE->getVectorOperandType())) {
+      if (TTI->getNumberOfParts(VecTy) !=
+          TTI->getNumberOfParts(EE->getVectorOperandType())) {
         auto It =
             ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
         It->getSecond() = std::min<int>(It->second, Idx);
@@ -6074,16 +6071,16 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
           // Use getExtractWithExtendCost() to calculate the cost of
           // extractelement/ext pair.
           Cost -=
-              TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
-                                              EE->getVectorOperandType(), Idx);
+              TTI->getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+                                            EE->getVectorOperandType(), Idx);
           // Add back the cost of s|zext which is subtracted separately.
-          Cost += TTIRef.getCastInstrCost(
+          Cost += TTI->getCastInstrCost(
               Ext->getOpcode(), Ext->getType(), EE->getType(),
               TTI::getCastContextHint(Ext), CostKind, Ext);
           continue;
         }
       }
-      Cost -= TTIRef.getVectorInstrCost(*EE, EE->getVectorOperandType(), Idx);
+      Cost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(), Idx);
     }
     // Add a cost for subvector extracts/inserts if required.
     for (const auto &Data : ExtractVectorsTys) {
@@ -6091,26 +6088,24 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
       unsigned NumElts = VecTy->getNumElements();
       if (Data.second % NumElts == 0)
         continue;
-      if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) {
+      if (TTI->getNumberOfParts(EEVTy) > TTI->getNumberOfParts(VecTy)) {
         unsigned Idx = (Data.second / NumElts) * NumElts;
         unsigned EENumElts = EEVTy->getNumElements();
         if (Idx + NumElts <= EENumElts) {
-          Cost +=
-              TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                    EEVTy, None, CostKind, Idx, VecTy);
+          Cost += TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+                                      EEVTy, None, CostKind, Idx, VecTy);
         } else {
           // Need to round up the subvector type vectorization factor to avoid a
           // crash in cost model functions. Make SubVT so that Idx + VF of SubVT
           // <= EENumElts.
           auto *SubVT =
               FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
-          Cost +=
-              TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
-                                    EEVTy, None, CostKind, Idx, SubVT);
+          Cost += TTI->getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+                                      EEVTy, None, CostKind, Idx, SubVT);
         }
       } else {
-        Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
-                                      VecTy, None, CostKind, 0, EEVTy);
+        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
+                                    VecTy, None, CostKind, 0, EEVTy);
       }
     }
   };
@@ -6248,19 +6243,19 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
         InstructionCost ScalarsCost = 0;
         for (Value *V : VectorizedLoads) {
           auto *LI = cast<LoadInst>(V);
-          ScalarsCost += TTI->getMemoryOpCost(
-              Instruction::Load, LI->getType(), LI->getAlign(),
-              LI->getPointerAddressSpace(), CostKind,
-              {TTI::OK_AnyValue, TTI::OP_None}, LI);
+          ScalarsCost +=
+              TTI->getMemoryOpCost(Instruction::Load, LI->getType(),
+                                   LI->getAlign(), LI->getPointerAddressSpace(),
+                                   CostKind, TTI::OperandValueInfo(), LI);
         }
         auto *LI = cast<LoadInst>(E->getMainOp());
         auto *LoadTy = FixedVectorType::get(LI->getType(), VF);
         Align Alignment = LI->getAlign();
-        GatherCost += VectorizedCnt *
-                      TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
-                                           LI->getPointerAddressSpace(),
-                                           CostKind, {TTI::OK_AnyValue,
-                                                      TTI::OP_None}, LI);
+        GatherCost +=
+            VectorizedCnt *
+            TTI->getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+                                 LI->getPointerAddressSpace(), CostKind,
+                                 TTI::OperandValueInfo(), LI);
         GatherCost += ScatterVectorizeCnt *
                       TTI->getGatherScatterOpCost(
                           Instruction::Load, LoadTy, LI->getPointerOperand(),
@@ -6306,48 +6301,48 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
   unsigned ShuffleOrOp =
       E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
   switch (ShuffleOrOp) {
-    case Instruction::PHI:
-      return 0;
+  case Instruction::PHI:
+    return 0;
 
-    case Instruction::ExtractValue:
-    case Instruction::ExtractElement: {
-      // The common cost of removal ExtractElement/ExtractValue instructions +
-      // the cost of shuffles, if required to resuffle the original vector.
-      if (NeedToShuffleReuses) {
-        unsigned Idx = 0;
-        for (unsigned I : E->ReuseShuffleIndices) {
-          if (ShuffleOrOp == Instruction::ExtractElement) {
-            auto *EE = cast<ExtractElementInst>(VL[I]);
-            CommonCost -= TTI->getVectorInstrCost(
-                *EE, EE->getVectorOperandType(), *getExtractIndex(EE));
-          } else {
-            CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                  VecTy, Idx);
-            ++Idx;
-          }
+  case Instruction::ExtractValue:
+  case Instruction::ExtractElement: {
+    // The common cost of removal ExtractElement/ExtractValue instructions +
+    // the cost of shuffles, if required to resuffle the original vector.
+    if (NeedToShuffleReuses) {
+      unsigned Idx = 0;
+      for (unsigned I : E->ReuseShuffleIndices) {
+        if (ShuffleOrOp == Instruction::ExtractElement) {
+          auto *EE = cast<ExtractElementInst>(VL[I]);
+          CommonCost -= TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(),
+                                                *getExtractIndex(EE));
+        } else {
+          CommonCost -=
+              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
+          ++Idx;
         }
-        Idx = EntryVF;
-        for (Value *V : VL) {
-          if (ShuffleOrOp == Instruction::ExtractElement) {
-            auto *EE = cast<ExtractElementInst>(V);
-            CommonCost += TTI->getVectorInstrCost(
-                *EE, EE->getVectorOperandType(), *getExtractIndex(EE));
-          } else {
-            --Idx;
-            CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
-                                                  VecTy, Idx);
-          }
+      }
+      Idx = EntryVF;
+      for (Value *V : VL) {
+        if (ShuffleOrOp == Instruction::ExtractElement) {
+          auto *EE = cast<ExtractElementInst>(V);
+          CommonCost += TTI->getVectorInstrCost(*EE, EE->getVectorOperandType(),
+                                                *getExtractIndex(EE));
+        } else {
+          --Idx;
+          CommonCost +=
+              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
         }
       }
-      if (ShuffleOrOp == Instruction::ExtractValue) {
-        for (unsigned I = 0, E = VL.size(); I < E; ++I) {
-          auto *EI = cast<Instruction>(VL[I]);
-          // Take credit for instruction that will become dead.
-          if (EI->hasOneUse()) {
-            Instruction *Ext = EI->user_back();
-            if (isa<SExtInst, ZExtInst>(Ext) &&
-                all_of(Ext->users(),
-                       [](User *U) { return isa<GetElementPtrInst>(U); })) {
+    }
+    if (ShuffleOrOp == Instruction::ExtractValue) {
+      for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+        auto *EI = cast<Instruction>(VL[I]);
+        // Take credit for instruction that will become dead.
+        if (EI->hasOneUse()) {
+          Instruction *Ext = EI->user_back();
+          if (isa<SExtInst, ZExtInst>(Ext) && all_of(Ext->users(), [](User *U) {
+                return isa<GetElementPtrInst>(U);
+              })) {
             // Use getExtractWithExtendCost() to calculate the cost of
             // extractelement/ext pair.
             CommonCost -= TTI->getExtractWithExtendCost(
@@ -6357,462 +6352,453 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
                 Ext->getOpcode(), Ext->getType(), EI->getType(),
                 TTI::getCastContextHint(Ext), CostKind, Ext);
             continue;
-            }
           }
-          CommonCost -=
-              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
         }
+        CommonCost -=
+            TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
+      }
+    } else {
+      AdjustExtractsCost(CommonCost);
+    }
+    return CommonCost;
+  }
+  case Instruction::InsertElement: {
+    assert(E->ReuseShuffleIndices.empty() &&
+           "Unique insertelements only are expected.");
+    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
+    unsigned const NumElts = SrcVecTy->getNumElements();
+    unsigned const NumScalars = VL.size();
+
+    unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
+
+    SmallVector<int> InsertMask(NumElts, UndefMaskElem);
+    unsigned OffsetBeg = *getInsertIndex(VL.front());
+    unsigned OffsetEnd = OffsetBeg;
+    InsertMask[OffsetBeg] = 0;
+    for (auto [I, V] : enumerate(VL.drop_front())) {
+      unsigned Idx = *getInsertIndex(V);
+      if (OffsetBeg > Idx)
+        OffsetBeg = Idx;
+      else if (OffsetEnd < Idx)
+        OffsetEnd = Idx;
+      InsertMask[Idx] = I + 1;
+    }
+    unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
+    if (NumOfParts > 0)
+      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
+    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
+                     VecScalarsSz;
+    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
+    unsigned InsertVecSz = std::min<unsigned>(
+        PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
+        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
+    bool IsWholeSubvector =
+        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
+    // Check if we can safely insert a subvector. If it is not possible, just
+    // generate a whole-sized vector and shuffle the source vector and the new
+    // subvector.
+    if (OffsetBeg + InsertVecSz > VecSz) {
+      // Align OffsetBeg to generate correct mask.
+      OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
+      InsertVecSz = VecSz;
+    }
+
+    APInt DemandedElts = APInt::getZero(NumElts);
+    // TODO: Add support for Instruction::InsertValue.
+    SmallVector<int> Mask;
+    if (!E->ReorderIndices.empty()) {
+      inversePermutation(E->ReorderIndices, Mask);
+      Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
+    } else {
+      Mask.assign(VecSz, UndefMaskElem);
+      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
+    }
+    bool IsIdentity = true;
+    SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
+    Mask.swap(PrevMask);
+    for (unsigned I = 0; I < NumScalars; ++I) {
+      unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
+      DemandedElts.setBit(InsertIdx);
+      IsIdentity &= InsertIdx - OffsetBeg == I;
+      Mask[InsertIdx - OffsetBeg] = I;
+    }
+    assert(Offset < NumElts && "Failed to find vector index offset");
+
+    InstructionCost Cost = 0;
+    Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
+                                          /*Insert*/ true, /*Extract*/ false);
+
+    // First cost - resize to actual vector size if not identity shuffle or
+    // need to shift the vector.
+    // Do not calculate the cost if the actual size is the register size and
+    // we can merge this shuffle with the following SK_Select.
+    auto *InsertVecTy =
+        FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
+    if (!IsIdentity)
+      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                  InsertVecTy, Mask);
+    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
+      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
+    }));
+    // Second cost - permutation with subvector, if some elements are from the
+    // initial vector or inserting a subvector.
+    // TODO: Implement the analysis of the FirstInsert->getOperand(0)
+    // subvector of ActualVecTy.
+    SmallBitVector InMask =
+        isUndefVector(FirstInsert->getOperand(0), InsertMask);
+    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
+      if (InsertVecSz != VecSz) {
+        auto *ActualVecTy =
+            FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
+        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, None,
+                                    CostKind, OffsetBeg - Offset, InsertVecTy);
       } else {
-        AdjustExtractsCost(CommonCost);
+        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
+          Mask[I] = InMask.test(I) ? UndefMaskElem : I;
+        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
+             I <= End; ++I)
+          if (Mask[I] != UndefMaskElem)
+            Mask[I] = I + VecSz;
+        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
+          Mask[I] = InMask.test(I) ? UndefMaskElem : I;
+        Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
       }
-      return CommonCost;
     }
-    case Instruction::InsertElement: {
-      assert(E->ReuseShuffleIndices.empty() &&
-             "Unique insertelements only are expected.");
-      auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
-      unsigned const NumElts = SrcVecTy->getNumElements();
-      unsigned const NumScalars = VL.size();
+    return Cost;
+  }
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    InstructionCost ScalarEltCost =
+        TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
+                              TTI::getCastContextHint(VL0), CostKind, VL0);
+    if (NeedToShuffleReuses) {
+      CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
+    }
 
-      unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
+    // Calculate the cost of this instruction.
+    InstructionCost ScalarCost = VL.size() * ScalarEltCost;
 
-      SmallVector<int> InsertMask(NumElts, UndefMaskElem);
-      unsigned OffsetBeg = *getInsertIndex(VL.front());
-      unsigned OffsetEnd = OffsetBeg;
-      InsertMask[OffsetBeg] = 0;
-      for (auto [I, V] : enumerate(VL.drop_front())) {
-        unsigned Idx = *getInsertIndex(V);
-        if (OffsetBeg > Idx)
-          OffsetBeg = Idx;
-        else if (OffsetEnd < Idx)
-          OffsetEnd = Idx;
-        InsertMask[Idx] = I + 1;
-      }
-      unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
-      if (NumOfParts > 0)
-        VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
-      unsigned VecSz =
-          (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
-          VecScalarsSz;
-      unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
-      unsigned InsertVecSz = std::min<unsigned>(
-          PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
-          ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) *
-              VecScalarsSz);
-      bool IsWholeSubvector =
-          OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
-      // Check if we can safely insert a subvector. If it is not possible, just
-      // generate a whole-sized vector and shuffle the source vector and the new
-      // subvector.
-      if (OffsetBeg + InsertVecSz > VecSz) {
-        // Align OffsetBeg to generate correct mask.
-        OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
-        InsertVecSz = VecSz;
-      }
-
-      APInt DemandedElts = APInt::getZero(NumElts);
-      // TODO: Add support for Instruction::InsertValue.
-      SmallVector<int> Mask;
-      if (!E->ReorderIndices.empty()) {
-        inversePermutation(E->ReorderIndices, Mask);
-        Mask.append(InsertVecSz - Mask.size(), UndefMaskElem);
-      } else {
-        Mask.assign(VecSz, UndefMaskElem);
-        std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
-      }
-      bool IsIdentity = true;
-      SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem);
-      Mask.swap(PrevMask);
-      for (unsigned I = 0; I < NumScalars; ++I) {
-        unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
-        DemandedElts.setBit(InsertIdx);
-        IsIdentity &= InsertIdx - OffsetBeg == I;
-        Mask[InsertIdx - OffsetBeg] = I;
-      }
-      assert(Offset < NumElts && "Failed to find vector index offset");
-
-      InstructionCost Cost = 0;
-      Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
-                                            /*Insert*/ true, /*Extract*/ false);
-
-      // First cost - resize to actual vector size if not identity shuffle or
-      // need to shift the vector.
-      // Do not calculate the cost if the actual size is the register size and
-      // we can merge this shuffle with the following SK_Select.
-      auto *InsertVecTy =
-          FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz);
-      if (!IsIdentity)
-        Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
-                                    InsertVecTy, Mask);
-      auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
-        return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
-      }));
-      // Second cost - permutation with subvector, if some elements are from the
-      // initial vector or inserting a subvector.
-      // TODO: Implement the analysis of the FirstInsert->getOperand(0)
-      // subvector of ActualVecTy.
-      SmallBitVector InMask =
-          isUndefVector(FirstInsert->getOperand(0), InsertMask);
-      if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
-        if (InsertVecSz != VecSz) {
-          auto *ActualVecTy =
-              FixedVectorType::get(SrcVecTy->getElementType(), VecSz);
-          Cost +=
-              TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, None,
-                                  CostKind, OffsetBeg - Offset, InsertVecTy);
-        } else {
-          for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
-            Mask[I] = InMask.test(I) ? UndefMaskElem : I;
-          for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
-               I <= End; ++I)
-            if (Mask[I] != UndefMaskElem)
-              Mask[I] = I + VecSz;
-          for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
-            Mask[I] = InMask.test(I) ? UndefMaskElem : I;
-          Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
-        }
-      }
-      return Cost;
+    auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
+    InstructionCost VecCost = 0;
+    // Check if the values are candidates to demote.
+    if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
+      VecCost = CommonCost + TTI->getCastInstrCost(
+                                 E->getOpcode(), VecTy, SrcVecTy,
+                                 TTI::getCastContextHint(VL0), CostKind, VL0);
     }
-    case Instruction::ZExt:
-    case Instruction::SExt:
-    case Instruction::FPToUI:
-    case Instruction::FPToSI:
-    case Instruction::FPExt:
-    case Instruction::PtrToInt:
-    case Instruction::IntToPtr:
-    case Instruction::SIToFP:
-    case Instruction::UIToFP:
-    case Instruction::Trunc:
-    case Instruction::FPTrunc:
-    case Instruction::BitCast: {
-      Type *SrcTy = VL0->getOperand(0)->getType();
-      InstructionCost ScalarEltCost =
-          TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
-                                TTI::getCastContextHint(VL0), CostKind, VL0);
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
-      }
-
-      // Calculate the cost of this instruction.
-      InstructionCost ScalarCost = VL.size() * ScalarEltCost;
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+    return VecCost - ScalarCost;
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp:
+  case Instruction::Select: {
+    // Calculate the cost of this instruction.
+    InstructionCost ScalarEltCost =
+        TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+                                CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
+    if (NeedToShuffleReuses) {
+      CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
+    }
+    auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
+    InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
 
-      auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
-      InstructionCost VecCost = 0;
-      // Check if the values are candidates to demote.
-      if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
-        VecCost = CommonCost + TTI->getCastInstrCost(
-                                   E->getOpcode(), VecTy, SrcVecTy,
-                                   TTI::getCastContextHint(VL0), CostKind, VL0);
+    // Check if all entries in VL are either compares or selects with compares
+    // as condition that have the same predicates.
+    CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
+    bool First = true;
+    for (auto *V : VL) {
+      CmpInst::Predicate CurrentPred;
+      auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
+      if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
+           !match(V, MatchCmp)) ||
+          (!First && VecPred != CurrentPred)) {
+        VecPred = CmpInst::BAD_ICMP_PREDICATE;
+        break;
       }
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
-      return VecCost - ScalarCost;
+      First = false;
+      VecPred = CurrentPred;
     }
-    case Instruction::FCmp:
-    case Instruction::ICmp:
-    case Instruction::Select: {
-      // Calculate the cost of this instruction.
-      InstructionCost ScalarEltCost =
-          TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
-                                  CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
-      }
-      auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
-      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
-
-      // Check if all entries in VL are either compares or selects with compares
-      // as condition that have the same predicates.
-      CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
-      bool First = true;
-      for (auto *V : VL) {
-        CmpInst::Predicate CurrentPred;
-        auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
-        if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
-             !match(V, MatchCmp)) ||
-            (!First && VecPred != CurrentPred)) {
-          VecPred = CmpInst::BAD_ICMP_PREDICATE;
-          break;
-        }
-        First = false;
-        VecPred = CurrentPred;
-      }
 
-      InstructionCost VecCost = TTI->getCmpSelInstrCost(
-          E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
-      // Check if it is possible and profitable to use min/max for selects in
-      // VL.
-      //
-      auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
-      if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
-        IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
-                                          {VecTy, VecTy});
-        InstructionCost IntrinsicCost =
-            TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
-        // If the selects are the only uses of the compares, they will be dead
-        // and we can adjust the cost by removing their cost.
-        if (IntrinsicAndUse.second)
-          IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
-                                                   MaskTy, VecPred, CostKind);
-        VecCost = std::min(VecCost, IntrinsicCost);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
-      return CommonCost + VecCost - ScalarCost;
+    InstructionCost VecCost = TTI->getCmpSelInstrCost(
+        E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
+    // Check if it is possible and profitable to use min/max for selects in
+    // VL.
+    //
+    auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
+    if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
+      IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
+                                        {VecTy, VecTy});
+      InstructionCost IntrinsicCost =
+          TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+      // If the selects are the only uses of the compares, they will be dead
+      // and we can adjust the cost by removing their cost.
+      if (IntrinsicAndUse.second)
+        IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
+                                                 MaskTy, VecPred, CostKind);
+      VecCost = std::min(VecCost, IntrinsicCost);
     }
-    case Instruction::FNeg:
-    case Instruction::Add:
-    case Instruction::FAdd:
-    case Instruction::Sub:
-    case Instruction::FSub:
-    case Instruction::Mul:
-    case Instruction::FMul:
-    case Instruction::UDiv:
-    case Instruction::SDiv:
-    case Instruction::FDiv:
-    case Instruction::URem:
-    case Instruction::SRem:
-    case Instruction::FRem:
-    case Instruction::Shl:
-    case Instruction::LShr:
-    case Instruction::AShr:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor: {
-      const unsigned OpIdx = isa<BinaryOperator>(VL0) ? 1 : 0;
-
-      InstructionCost ScalarCost = 0;
-      for (auto *V : VL) {
-        auto *VI = cast<Instruction>(V);
-        TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
-        TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(VI->getOperand(OpIdx));
-        SmallVector<const Value *, 4> Operands(VI->operand_values());
-        ScalarCost +=
-          TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind,
-                                      Op1Info, Op2Info, Operands, VI);
-      }
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarCost/VL.size();
-      }
-      TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0);
-      TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx);
-      InstructionCost VecCost =
-          TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind,
-                                      Op1Info, Op2Info);
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
-      return CommonCost + VecCost - ScalarCost;
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+    return CommonCost + VecCost - ScalarCost;
+  }
+  case Instruction::FNeg:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    const unsigned OpIdx = isa<BinaryOperator>(VL0) ? 1 : 0;
+
+    InstructionCost ScalarCost = 0;
+    for (auto *V : VL) {
+      auto *VI = cast<Instruction>(V);
+      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
+      TTI::OperandValueInfo Op2Info =
+          TTI::getOperandInfo(VI->getOperand(OpIdx));
+      SmallVector<const Value *, 4> Operands(VI->operand_values());
+      ScalarCost += TTI->getArithmeticInstrCost(
+          E->getOpcode(), ScalarTy, CostKind, Op1Info, Op2Info, Operands, VI);
     }
-    case Instruction::GetElementPtr: {
-      TargetTransformInfo::OperandValueKind Op1VK =
-          TargetTransformInfo::OK_AnyValue;
-      TargetTransformInfo::OperandValueKind Op2VK =
-          any_of(VL,
-                 [](Value *V) {
-                   return isa<GetElementPtrInst>(V) &&
-                          !isConstant(
-                              cast<GetElementPtrInst>(V)->getOperand(1));
-                 })
-              ? TargetTransformInfo::OK_AnyValue
-              : TargetTransformInfo::OK_UniformConstantValue;
-
-      InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
-          Instruction::Add, ScalarTy, CostKind,
-          {Op1VK, TargetTransformInfo::OP_None},
-          {Op2VK, TargetTransformInfo::OP_None});
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
-      }
-      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
-      InstructionCost VecCost = TTI->getArithmeticInstrCost(
-          Instruction::Add, VecTy, CostKind,
-          {Op1VK, TargetTransformInfo::OP_None},
-          {Op2VK, TargetTransformInfo::OP_None});
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
-      return CommonCost + VecCost - ScalarCost;
+    if (NeedToShuffleReuses) {
+      CommonCost -= (EntryVF - VL.size()) * ScalarCost / VL.size();
+    }
+    TTI::OperandValueInfo Op1Info = getOperandInfo(VL, 0);
+    TTI::OperandValueInfo Op2Info = getOperandInfo(VL, OpIdx);
+    InstructionCost VecCost = TTI->getArithmeticInstrCost(
+        E->getOpcode(), VecTy, CostKind, Op1Info, Op2Info);
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+    return CommonCost + VecCost - ScalarCost;
+  }
+  case Instruction::GetElementPtr: {
+    TargetTransformInfo::OperandValueKind Op1VK =
+        TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+        any_of(VL,
+               [](Value *V) {
+                 return isa<GetElementPtrInst>(V) &&
+                        !isConstant(cast<GetElementPtrInst>(V)->getOperand(1));
+               })
+            ? TargetTransformInfo::OK_AnyValue
+            : TargetTransformInfo::OK_UniformConstantValue;
+
+    InstructionCost ScalarEltCost =
+        TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind,
+                                    {Op1VK, TargetTransformInfo::OP_None},
+                                    {Op2VK, TargetTransformInfo::OP_None});
+    if (NeedToShuffleReuses) {
+      CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
     }
-    case Instruction::Load: {
-      // Cost of wide load - cost of scalar loads.
-      Align Alignment = cast<LoadInst>(VL0)->getAlign();
-      InstructionCost ScalarEltCost =
-          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, 0,
-                               CostKind, {TTI::OK_AnyValue, TTI::OP_None}, VL0);
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
-      }
-      InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
-      InstructionCost VecLdCost;
-      if (E->State == TreeEntry::Vectorize) {
-        VecLdCost =
-            TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
-                                 CostKind, TTI::OperandValueInfo(), VL0);
-        for (Value *V : VL) {
-          auto *VI = cast<LoadInst>(V);
-          // Add the costs of scalar GEP pointers, to be removed from the code.
-          if (VI == VL0)
-            continue;
-          auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
-          if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
-            continue;
-          ScalarLdCost += TTI->getArithmeticInstrCost(Instruction::Add,
-                                                      Ptr->getType(), CostKind);
-        }
-      } else {
-        assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
-        Align CommonAlignment = Alignment;
-        for (Value *V : VL)
-          CommonAlignment =
-              std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
-        VecLdCost = TTI->getGatherScatterOpCost(
-            Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
-            /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
-      }
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
-      return CommonCost + VecLdCost - ScalarLdCost;
+    InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+    InstructionCost VecCost =
+        TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind,
+                                    {Op1VK, TargetTransformInfo::OP_None},
+                                    {Op2VK, TargetTransformInfo::OP_None});
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+    return CommonCost + VecCost - ScalarCost;
+  }
+  case Instruction::Load: {
+    // Cost of wide load - cost of scalar loads.
+    Align Alignment = cast<LoadInst>(VL0)->getAlign();
+    InstructionCost ScalarEltCost =
+        TTI->getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, 0,
+                             CostKind, {TTI::OK_AnyValue, TTI::OP_None}, VL0);
+    if (NeedToShuffleReuses) {
+      CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
     }
-    case Instruction::Store: {
-      // We know that we can merge the stores. Calculate the cost.
-      bool IsReorder = !E->ReorderIndices.empty();
-      auto *SI =
-          cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
-      Align Alignment = SI->getAlign();
-      InstructionCost ScalarStCost = 0;
-      for (auto *V : VL) {
-        auto *VI = cast<StoreInst>(V);
-        TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getOperand(0));
-        ScalarStCost +=
-          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0,
-                               CostKind, OpInfo, VI);
+    InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
+    InstructionCost VecLdCost;
+    if (E->State == TreeEntry::Vectorize) {
+      VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
+                                       CostKind, TTI::OperandValueInfo(), VL0);
+      for (Value *V : VL) {
+        auto *VI = cast<LoadInst>(V);
         // Add the costs of scalar GEP pointers, to be removed from the code.
-        if (VI == SI)
+        if (VI == VL0)
           continue;
         auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
         if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
           continue;
-        ScalarStCost += TTI->getArithmeticInstrCost(Instruction::Add,
+        ScalarLdCost += TTI->getArithmeticInstrCost(Instruction::Add,
                                                     Ptr->getType(), CostKind);
       }
-      TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0);
-      InstructionCost VecStCost =
-        TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment, 0, CostKind,
-                             OpInfo);
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
-      return CommonCost + VecStCost - ScalarStCost;
+    } else {
+      assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+      Align CommonAlignment = Alignment;
+      for (Value *V : VL)
+        CommonAlignment =
+            std::min(CommonAlignment, cast<LoadInst>(V)->getAlign());
+      VecLdCost = TTI->getGatherScatterOpCost(
+          Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
+          /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
     }
-    case Instruction::Call: {
-      CallInst *CI = cast<CallInst>(VL0);
-      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
+    return CommonCost + VecLdCost - ScalarLdCost;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Calculate the cost.
+    bool IsReorder = !E->ReorderIndices.empty();
+    auto *SI = cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
+    Align Alignment = SI->getAlign();
+    InstructionCost ScalarStCost = 0;
+    for (auto *V : VL) {
+      auto *VI = cast<StoreInst>(V);
+      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getOperand(0));
+      ScalarStCost += TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
+                                           Alignment, 0, CostKind, OpInfo, VI);
+      // Add the costs of scalar GEP pointers, to be removed from the code.
+      if (VI == SI)
+        continue;
+      auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
+      if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
+        continue;
+      ScalarStCost += TTI->getArithmeticInstrCost(Instruction::Add,
+                                                  Ptr->getType(), CostKind);
+    }
+    TTI::OperandValueInfo OpInfo = getOperandInfo(VL, 0);
+    InstructionCost VecStCost = TTI->getMemoryOpCost(
+        Instruction::Store, VecTy, Alignment, 0, CostKind, OpInfo);
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
+    return CommonCost + VecStCost - ScalarStCost;
+  }
+  case Instruction::Call: {
+    CallInst *CI = cast<CallInst>(VL0);
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
-      // Calculate the cost of the scalar and vector calls.
-      IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
-      InstructionCost ScalarEltCost =
-          TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
-      if (NeedToShuffleReuses) {
-        CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
-      }
-      InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
+    // Calculate the cost of the scalar and vector calls.
+    IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
+    InstructionCost ScalarEltCost =
+        TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+    if (NeedToShuffleReuses) {
+      CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
+    }
+    InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
 
-      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
-      InstructionCost VecCallCost =
-          std::min(VecCallCosts.first, VecCallCosts.second);
+    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+    InstructionCost VecCallCost =
+        std::min(VecCallCosts.first, VecCallCosts.second);
 
-      LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
-                        << " (" << VecCallCost << "-" << ScalarCallCost << ")"
-                        << " for " << *CI << "\n");
+    LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
+                      << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+                      << " for " << *CI << "\n");
 
-      return CommonCost + VecCallCost - ScalarCallCost;
-    }
-    case Instruction::ShuffleVector: {
-      assert(E->isAltShuffle() &&
-             ((Instruction::isBinaryOp(E->getOpcode()) &&
-               Instruction::isBinaryOp(E->getAltOpcode())) ||
-              (Instruction::isCast(E->getOpcode()) &&
-               Instruction::isCast(E->getAltOpcode())) ||
-              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
-             "Invalid Shuffle Vector Operand");
-      InstructionCost ScalarCost = 0;
-      if (NeedToShuffleReuses) {
-        for (unsigned Idx : E->ReuseShuffleIndices) {
-          Instruction *I = cast<Instruction>(VL[Idx]);
-          CommonCost -= TTI->getInstructionCost(I, CostKind);
-        }
-        for (Value *V : VL) {
-          Instruction *I = cast<Instruction>(V);
-          CommonCost += TTI->getInstructionCost(I, CostKind);
-        }
+    return CommonCost + VecCallCost - ScalarCallCost;
+  }
+  case Instruction::ShuffleVector: {
+    assert(E->isAltShuffle() &&
+           ((Instruction::isBinaryOp(E->getOpcode()) &&
+             Instruction::isBinaryOp(E->getAltOpcode())) ||
+            (Instruction::isCast(E->getOpcode()) &&
+             Instruction::isCast(E->getAltOpcode())) ||
+            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
+           "Invalid Shuffle Vector Operand");
+    InstructionCost ScalarCost = 0;
+    if (NeedToShuffleReuses) {
+      for (unsigned Idx : E->ReuseShuffleIndices) {
+        Instruction *I = cast<Instruction>(VL[Idx]);
+        CommonCost -= TTI->getInstructionCost(I, CostKind);
       }
       for (Value *V : VL) {
         Instruction *I = cast<Instruction>(V);
-        assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
-        ScalarCost += TTI->getInstructionCost(I, CostKind);
-      }
-      // VecCost is equal to sum of the cost of creating 2 vectors
-      // and the cost of creating shuffle.
-      InstructionCost VecCost = 0;
-      // Try to find the previous shuffle node with the same operands and same
-      // main/alternate ops.
-      auto &&TryFindNodeWithEqualOperands = [this, E]() {
-        for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
-          if (TE.get() == E)
-            break;
-          if (TE->isAltShuffle() &&
-              ((TE->getOpcode() == E->getOpcode() &&
-                TE->getAltOpcode() == E->getAltOpcode()) ||
-               (TE->getOpcode() == E->getAltOpcode() &&
-                TE->getAltOpcode() == E->getOpcode())) &&
-              TE->hasEqualOperands(*E))
-            return true;
-        }
-        return false;
-      };
-      if (TryFindNodeWithEqualOperands()) {
-        LLVM_DEBUG({
-          dbgs() << "SLP: diamond match for alternate node found.\n";
-          E->dump();
-        });
-        // No need to add new vector costs here since we're going to reuse
-        // same main/alternate vector ops, just do different shuffling.
-      } else if (Instruction::isBinaryOp(E->getOpcode())) {
-        VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
-        VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
-                                               CostKind);
-      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
-        VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
-                                          Builder.getInt1Ty(),
-                                          CI0->getPredicate(), CostKind, VL0);
-        VecCost += TTI->getCmpSelInstrCost(
-            E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
-            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
-            E->getAltOp());
-      } else {
-        Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
-        Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
-        auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
-        auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
-        VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
-                                        TTI::CastContextHint::None, CostKind);
-        VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
-                                         TTI::CastContextHint::None, CostKind);
-      }
-
-      if (E->ReuseShuffleIndices.empty()) {
-        CommonCost =
-            TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
-      } else {
-        SmallVector<int> Mask;
-        buildShuffleEntryMask(
-            E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
-            [E](Instruction *I) {
-              assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
-              return I->getOpcode() == E->getAltOpcode();
-            },
-            Mask);
-        CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
-                                         FinalVecTy, Mask);
+        CommonCost += TTI->getInstructionCost(I, CostKind);
       }
-      LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
-      return CommonCost + VecCost - ScalarCost;
     }
-    default:
-      llvm_unreachable("Unknown instruction");
+    for (Value *V : VL) {
+      Instruction *I = cast<Instruction>(V);
+      assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+      ScalarCost += TTI->getInstructionCost(I, CostKind);
+    }
+    // VecCost is equal to sum of the cost of creating 2 vectors
+    // and the cost of creating shuffle.
+    InstructionCost VecCost = 0;
+    // Try to find the previous shuffle node with the same operands and same
+    // main/alternate ops.
+    auto TryFindNodeWithEqualOperands = [=]() {
+      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+        if (TE.get() == E)
+          break;
+        if (TE->isAltShuffle() &&
+            ((TE->getOpcode() == E->getOpcode() &&
+              TE->getAltOpcode() == E->getAltOpcode()) ||
+             (TE->getOpcode() == E->getAltOpcode() &&
+              TE->getAltOpcode() == E->getOpcode())) &&
+            TE->hasEqualOperands(*E))
+          return true;
+      }
+      return false;
+    };
+    if (TryFindNodeWithEqualOperands()) {
+      LLVM_DEBUG({
+        dbgs() << "SLP: diamond match for alternate node found.\n";
+        E->dump();
+      });
+      // No need to add new vector costs here since we're going to reuse
+      // same main/alternate vector ops, just do different shuffling.
+    } else if (Instruction::isBinaryOp(E->getOpcode())) {
+      VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+      VecCost +=
+          TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
+    } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+      VecCost =
+          TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+                                  CI0->getPredicate(), CostKind, VL0);
+      VecCost +=
+          TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+                                  cast<CmpInst>(E->getAltOp())->getPredicate(),
+                                  CostKind, E->getAltOp());
+    } else {
+      Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
+      Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
+      auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
+      auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
+      VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
+                                      TTI::CastContextHint::None, CostKind);
+      VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
+                                       TTI::CastContextHint::None, CostKind);
+    }
+
+    if (E->ReuseShuffleIndices.empty()) {
+      CommonCost =
+          TTI->getShuffleCost(TargetTransformInfo::SK_Select, FinalVecTy);
+    } else {
+      SmallVector<int> Mask;
+      buildShuffleEntryMask(
+          E->Scalars, E->ReorderIndices, E->ReuseShuffleIndices,
+          [E](Instruction *I) {
+            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+            return I->getOpcode() == E->getAltOpcode();
+          },
+          Mask);
+      CommonCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
+                                       FinalVecTy, Mask);
+    }
+    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+    return CommonCost + VecCost - ScalarCost;
+  }
+  default:
+    llvm_unreachable("Unknown instruction");
   }
 }
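
Most of the churn in this patch is re-indentation: the big switch in getEntryCost now has its case labels flush with the switch keyword, which is what clang-format's LLVM style produces (IndentCaseLabels is off by default). A minimal toy illustration of the two layouts, using stand-in opcodes rather than the real llvm::Instruction ones:

    // Toy stand-ins; the real code switches over llvm::Instruction opcodes.
    int classifyBefore(int Op) {
      switch (Op) { // old layout: labels indented one extra level
        case 0:
          return 0;
        default:
          return -1;
      }
    }

    int classifyAfter(int Op) {
      switch (Op) { // new layout: labels flush with the switch
      case 0:
        return 0;
      default:
        return -1;
      }
    }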
 
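Beyond whitespace, AdjustExtractsCost loses its old MSVC workaround: the deleted FIXME bound *TTI to a local TTIRef reference and spelled out every capture, while the new lambda uses a default by-value capture and calls TTI through the captured this pointer. A rough standalone sketch of the same capture pattern; Analysis and Pass are made-up names, not the real SLP types:

    struct Analysis {
      int cost(int X) const { return 2 * X; }
    };

    struct Pass {
      Analysis *TTI = nullptr;
      int run(int V) {
        // Old shape: bind a reference first, then list every capture.
        //   Analysis &TTIRef = *TTI;
        //   auto Adjust = [this, &TTIRef, V](int &C) { C -= TTIRef.cost(V); };
        // New shape: default capture; TTI is reached via the captured this,
        // so only the this pointer and V are actually copied.
        auto Adjust = [=](int &C) { C -= TTI->cost(V); };
        int Cost = 100;
        Adjust(Cost);
        return Cost;
      }
    };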

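A few getMemoryOpCost call sites also trade the braced {TTI::OK_AnyValue, TTI::OP_None} argument for a default-constructed TTI::OperandValueInfo(). For the patch to stay NFC those defaults must coincide, which the surrounding diff implies; below is a self-contained sketch of that assumed equivalence, with the enums and struct mimicked here rather than taken from TargetTransformInfo.h:

    #include <cassert>

    enum OperandValueKind { OK_AnyValue };   // mimics TTI::OK_AnyValue
    enum OperandValueProperties { OP_None }; // mimics TTI::OP_None

    struct OperandValueInfo { // member defaults assumed, not verified
      OperandValueKind Kind = OK_AnyValue;
      OperandValueProperties Properties = OP_None;
    };

    int main() {
      OperandValueInfo Braced{OK_AnyValue, OP_None}; // old spelling
      OperandValueInfo Defaulted;                    // new spelling
      assert(Braced.Kind == Defaulted.Kind &&
             Braced.Properties == Defaulted.Properties);
      return 0;
    }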