[llvm] [AArch64][CostModel] Consider the cost of const vector (PR #117539)
Sushant Gokhale via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 27 03:54:29 PST 2024
https://github.com/sushgokh updated https://github.com/llvm/llvm-project/pull/117539
From c03ffd443441e595d589349642d71e995931d1a0 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Mon, 25 Nov 2024 16:14:17 +0530
Subject: [PATCH] [AArch64][CostModel] Consider the cost of const vector
Currently, we consider the cost of a const vector to be zero. Consider the example below:
```
%2 = fadd <2 x float> %1, <float 21.0, float 22.0>
```
Here, the cost of the const vector `<float 21.0, float 22.0>` is considered zero.
However, it is not free: on AArch64, materializing it results in an `adrp + ldr` pair.
This patch alters the AArch64 cost model to account for the cost of materializing such const vectors.
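As an illustration, here is a minimal sketch of the lowering in question; the constant-pool label and register choices in the assembly comments are assumptions for illustration, not actual codegen output:
```
; The <2 x float> constant operand has to be materialized before the fadd.
define <2 x float> @fadd_const(<2 x float> %v) {
  %r = fadd <2 x float> %v, <float 2.100000e+01, float 2.200000e+01>
  ret <2 x float> %r
}

; Typical AArch64 lowering (assumed constant-pool label .LCPI0_0):
;   adrp x8, .LCPI0_0
;   ldr  d1, [x8, :lo12:.LCPI0_0]
;   fadd v0.2s, v0.2s, v1.2s
;   ret
```
The adrp/ldr pair above is what the new TTI hook charges for, rather than treating the constant operand as free.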
Perf results (tested with `-mcpu=neoverse-v2`; uplift indicated by a + sign):
541.leela +(2.2 - 3)%
---
.../llvm/Analysis/TargetTransformInfo.h | 21 ++++
.../llvm/Analysis/TargetTransformInfoImpl.h | 10 ++
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 19 +++-
llvm/lib/Analysis/TargetTransformInfo.cpp | 13 +++
.../AArch64/AArch64TargetTransformInfo.cpp | 32 +++++-
.../AArch64/AArch64TargetTransformInfo.h | 20 +++-
.../Transforms/Vectorize/SLPVectorizer.cpp | 106 +++++++++++++++---
.../SLPVectorizer/AArch64/insertelement.ll | 10 +-
.../AArch64/memory-runtime-checks.ll | 47 +++++---
.../SLPVectorizer/AArch64/vec3-base.ll | 11 +-
.../SLPVectorizer/jumbled_store_crash.ll | 80 ++++++++++++-
.../materialize-vector-of-consts.ll | 29 +++--
.../multi-node-vectorized-insts.ll | 45 +++++---
13 files changed, 363 insertions(+), 80 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 985ca1532e0149..832ea12552c56b 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1449,6 +1449,14 @@ class TargetTransformInfo {
const APInt &DemandedDstElts,
TTI::TargetCostKind CostKind) const;
+ /// \return The cost of materializing a constant vector.
+ InstructionCost getConstVectCost(unsigned Opcode, Type *Src, Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ OperandValueInfo OpdInfo,
+ const Instruction *I,
+ InstructionCost ConstVectScalarCost) const;
+
/// \return The cost of Load and Store instructions.
InstructionCost
getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
@@ -2148,6 +2156,11 @@ class TargetTransformInfo::Concept {
TTI::TargetCostKind CostKind) = 0;
virtual InstructionCost
+ getConstVectCost(unsigned Opcode, Type *Src, Align Alignment,
+ unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ OperandValueInfo OpInfo, const Instruction *I,
+ InstructionCost ConstVectScalarCost) = 0;
+ virtual InstructionCost
getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace, TTI::TargetCostKind CostKind,
OperandValueInfo OpInfo, const Instruction *I) = 0;
@@ -2850,6 +2863,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
DemandedDstElts, CostKind);
}
+ InstructionCost
+ getConstVectCost(unsigned Opcode, Type *Src, Align Alignment,
+ unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ OperandValueInfo OpInfo, const Instruction *I,
+ InstructionCost ConstVectScalarCost) override {
+ return Impl.getConstVectCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+ OpInfo, I, ConstVectScalarCost);
+ }
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
TTI::TargetCostKind CostKind,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 38aba183f6a173..b1abc34d8188eb 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -730,6 +730,16 @@ class TargetTransformInfoImplBase {
return 1;
}
+ InstructionCost getConstVectCost(unsigned Opcode, Type *Src, Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ TTI::OperandValueInfo OpInfo,
+ const Instruction *I,
+ InstructionCost ConstVectScalarCost) const {
+ // By default, the vector cost is considered the same as the scalar cost.
+ return ConstVectScalarCost;
+ }
+
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
unsigned AddressSpace,
TTI::TargetCostKind CostKind,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index d2fc40d8ae037e..165978e88042cf 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1368,11 +1368,20 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Cost;
}
- InstructionCost
- getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace, TTI::TargetCostKind CostKind,
- TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
- const Instruction *I = nullptr) {
+ InstructionCost getConstVectCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ TTI::OperandValueInfo OpInfo,
+ const Instruction *I,
+ InstructionCost ConstVectScalarCost) {
+ return ConstVectScalarCost;
+ }
+
+ InstructionCost getMemoryOpCost(
+ unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
+ const Instruction *I = nullptr) {
assert(!Src->isVoidTy() && "Invalid type");
// Assume types, such as structs, are expensive.
if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1fb2b9836de0cc..3f4f18a7cf4316 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1099,6 +1099,19 @@ InstructionCost TargetTransformInfo::getReplicationShuffleCost(
return Cost;
}
+InstructionCost TargetTransformInfo::getConstVectCost(
+ unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
+ const Instruction *I, InstructionCost ConstVectScalarCost) const {
+ assert((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ InstructionCost Cost =
+ TTIImpl->getConstVectCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+ OpInfo, I, ConstVectScalarCost);
+ assert(Cost >= 0 && "TTI should not produce negative costs!");
+ return Cost;
+}
+
InstructionCost TargetTransformInfo::getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 7a1e401bca18cb..2401a387f7edac 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3785,12 +3785,19 @@ bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
}
-InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
- MaybeAlign Alignment,
- unsigned AddressSpace,
- TTI::TargetCostKind CostKind,
- TTI::OperandValueInfo OpInfo,
- const Instruction *I) {
+// Return the cost of materializing a constant vector.
+InstructionCost AArch64TTIImpl::getConstVectCost(
+ unsigned Opcode, Type *Ty, MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
+ const Instruction *I, InstructionCost ConstVectScalarCost) {
+ return getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace, CostKind, OpInfo,
+ I, ConstVectScalarCost);
+}
+
+InstructionCost AArch64TTIImpl::getMemoryOpCost(
+ unsigned Opcode, Type *Ty, MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
+ const Instruction *I, InstructionCost ConstVectScalarCost) {
EVT VT = TLI->getValueType(DL, Ty, true);
// Type legalization can't handle structs
if (VT == MVT::Other)
@@ -3801,6 +3808,11 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
if (!LT.first.isValid())
return InstructionCost::getInvalid();
+ // FIXME: Consider the cost of materializing a const vector where the
+ // legalization cost is > 1.
+ if (ConstVectScalarCost.isValid() && LT.first.getValue().value() > 1)
+ return ConstVectScalarCost;
+
// The code-generator is currently not able to handle scalable vectors
// of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
// it. This change will be removed when code-generation for these types is
@@ -3845,6 +3857,14 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
// Otherwise we need to scalarize.
return cast<FixedVectorType>(Ty)->getNumElements() * 2;
}
+
+ // A const vector is lowered to `adrp + ldr`. The ldr is of the form
+ // "load vector reg, literal, S/D/Q forms" and has high latency.
+ // FIXME: This only considers the cost of the ldr; also consider the
+ // cost of the adrp.
+ if (ConstVectScalarCost.isValid())
+ return 4;
+
EVT EltVT = VT.getVectorElementType();
unsigned EltSize = EltVT.getScalarSizeInBits();
if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 201bc831b816b3..7b5850f5c87a90 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -23,6 +23,8 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/ProfileData/PGOCtxProfWriter.h"
+#include "llvm/Support/InstructionCost.h"
#include <cstdint>
#include <optional>
@@ -234,11 +236,19 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
bool IsZeroCmp) const;
bool useNeonVector(const Type *Ty) const;
- InstructionCost
- getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace, TTI::TargetCostKind CostKind,
- TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
- const Instruction *I = nullptr);
+ InstructionCost getConstVectCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ TTI::OperandValueInfo OpInfo,
+ const Instruction *I,
+ InstructionCost ConstVectScalarCost);
+
+ InstructionCost getMemoryOpCost(
+ unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
+ const Instruction *I = nullptr,
+ InstructionCost ConstVectScalarCost = InstructionCost::getInvalid());
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index d033b7c2ef4a92..0bd1933d8278a4 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3117,7 +3117,8 @@ class BoUpSLP {
/// roots. This method calculates the cost of extracting the values.
/// \param ForPoisonSrc true if initial vector is poison, false otherwise.
InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
- Type *ScalarTy) const;
+ Type *ScalarTy,
+ const TreeEntry *E = nullptr) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
@@ -3681,7 +3682,24 @@ class BoUpSLP {
if (AllConstsOrCasts)
CastMaxMinBWSizes =
std::make_pair(std::numeric_limits<unsigned>::max(), 1);
- MustGather.insert(VL.begin(), VL.end());
+ // Recording an all-constants entry helps avoid counting the cost of a
+ // const vector twice.
+ if (allConstant(VL) && !isSplat(VL)) {
+ for (Value *V : VL) {
+ const TreeEntry *TE = getTreeEntry(V);
+ assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
+ "Scalar already in tree!");
+ if (TE) {
+ if (TE != Last)
+ MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
+ Last);
+ continue;
+ }
+ ScalarToTreeEntry[V] = Last;
+ }
+ } else {
+ MustGather.insert(VL.begin(), VL.end());
+ }
}
if (UserTreeIdx.UserTE)
@@ -10053,8 +10071,10 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
return Constant::getAllOnesValue(Ty);
}
- InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
- if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
+ InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root,
+ const TreeEntry *E = nullptr) {
+ if ((!Root && allConstant(VL) && isSplat(VL)) ||
+ all_of(VL, IsaPred<UndefValue>))
return TTI::TCC_Free;
auto *VecTy = getWidenedType(ScalarTy, VL.size());
InstructionCost GatherCost = 0;
@@ -10098,7 +10118,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
(all_of(Gathers, IsaPred<UndefValue>)
? TTI::TCC_Free
: R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
- ScalarTy));
+ ScalarTy, E));
};
/// Compute the cost of creating a vector containing the extracted values from
@@ -10801,8 +10821,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
CommonMask[Idx] = Mask[Idx] + VF;
}
Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
- Value *Root = nullptr) {
- Cost += getBuildVectorCost(VL, Root);
+ Value *Root = nullptr, const TreeEntry *E = nullptr) {
+ Cost += getBuildVectorCost(VL, Root, E);
if (!Root) {
// FIXME: Need to find a way to avoid use of getNullValue here.
SmallVector<Constant *> Vals;
@@ -11019,8 +11039,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
if (E->isGather()) {
- if (allConstant(VL))
- return 0;
if (isa<InsertElementInst>(VL[0]))
return InstructionCost::getInvalid();
if (isa<CmpInst>(VL.front()))
@@ -12557,7 +12575,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
auto *Inst = cast<Instruction>(EU.Scalar);
InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
auto OperandIsScalar = [&](Value *V) {
- if (!getTreeEntry(V)) {
+ if (auto *TE = getTreeEntry(V);
+ // An all-constants entry does not result in a separate instruction.
+ // Ignore such an entry.
+ !TE || (TE && allConstant(TE->Scalars))) {
// Some extractelements might be not vectorized, but
// transformed into shuffle and removed from the function,
// consider it here.
@@ -13425,7 +13446,8 @@ BoUpSLP::isGatherShuffledEntry(
}
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
- Type *ScalarTy) const {
+ Type *ScalarTy,
+ const TreeEntry *E) const {
auto *VecTy = getWidenedType(ScalarTy, VL.size());
bool DuplicateNonConst = false;
// Find the cost of inserting/extracting values from the vector.
@@ -13479,6 +13501,56 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
Cost += TTI->getShuffleCost(
TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
+ } else if (allConstant(VL)) {
+ auto IsAllowedScalarTy = [&](const Type *T) {
+ return T->isFloatTy() || T->isDoubleTy() || T->isIntegerTy();
+ };
+ if (IsAllowedScalarTy(VL[0]->getType())) {
+ InstructionCost ScalarCost, VectorCost;
+
+ auto IsDuplicateEntry = [this](const TreeEntry *E) {
+ auto *TE = getTreeEntry(E->Scalars[0]);
+ if (TE != E && TE->isSame(E->Scalars))
+ return true;
+ else {
+ auto It = MultiNodeScalars.find(E->Scalars[0]);
+ if (It != MultiNodeScalars.end()) {
+ auto *TEIt = find_if(It->getSecond(), [E](TreeEntry *ME) {
+ return ME != E && ME->isSame(E->Scalars[0]);
+ });
+ if (TEIt != It->getSecond().end())
+ return true;
+ }
+ }
+ return false;
+ };
+
+ // FIXME: If there is more than 1 SLP tree realizing the same const
+ // vector, codegen will realize it only once, so there is no need to
+ // consider the cost of the const vector twice. But we currently can't
+ // check whether the tree entry is present in another SLP tree.
+ // FIXME: A tree entry can be a mix of non-consts and consts, and the
+ // final vector may be realized as a shuffle of consts and non-consts. In
+ // such a case, the const vector passed as VL here can contain poison and
+ // VL != E->Scalars, so we can't check for duplicate entries, i.e.,
+ // whether we are realizing this const vector twice.
+ assert(E && "TreeEntry E must point to a valid entry.");
+ if (E->isSame(VL) && !isSplat(VL) && !all_of(VL, IsaPred<UndefValue>) &&
+ !IsDuplicateEntry(E)) {
+ // Get unique scalars
+ SmallDenseSet<Value *> UniqScalars;
+ for (auto *V : VL)
+ UniqScalars.insert(V);
+
+ // Constant is realized by having a mov/fmov into GPR. So,
+ // ScalarCost = #UniqScalars
+ ScalarCost = (UniqScalars.size());
+ VectorCost = TTI->getConstVectCost(
+ Instruction::Load, VecTy, Align(), 0, CostKind,
+ TTI::OperandValueInfo(), nullptr, ScalarCost);
+ }
+ Cost = VectorCost - ScalarCost;
+ }
} else {
Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
/*Insert*/ true,
@@ -14275,7 +14347,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
add(V1, NewMask);
}
Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
- Value *Root = nullptr) {
+ Value *Root = nullptr, const TreeEntry *E = nullptr) {
return R.gather(VL, Root, ScalarTy,
[&](Value *V1, Value *V2, ArrayRef<int> Mask) {
return createShuffle(V1, V2, Mask);
@@ -15005,7 +15077,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
- Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
+ Value *BV =
+ ShuffleBuilder.gather(GatheredScalars, BVMask.size(), nullptr, E);
ShuffleBuilder.add(BV, BVMask);
}
if (all_of(NonConstants, [=](Value *V) {
@@ -15020,13 +15093,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
[&](Value *&Vec, SmallVectorImpl<int> &Mask) {
TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
- Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
+ Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec, E);
});
} else if (!allConstant(GatheredScalars)) {
// Gather unique scalars and all constants.
SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
- Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
+ Value *BV =
+ ShuffleBuilder.gather(GatheredScalars, ReuseMask.size(), nullptr, E);
ShuffleBuilder.add(BV, ReuseMask);
Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
SubVectorsMask);
@@ -15037,7 +15111,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
if (!isa<PoisonValue>(V))
Mask[I] = I;
}
- Value *BV = ShuffleBuilder.gather(GatheredScalars);
+ Value *BV = ShuffleBuilder.gather(GatheredScalars, 0, nullptr, E);
ShuffleBuilder.add(BV, Mask);
Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
SubVectorsMask);
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll
index 1198bb1d509ebb..d6d46e54456142 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll
@@ -39,10 +39,16 @@ declare float @llvm.fabs.f32(float)
define <4 x float> @insertelement_poison_lanes(ptr %0) {
; CHECK-LABEL: @insertelement_poison_lanes(
+; CHECK-NEXT: [[TRUNC_1:%.*]] = fptrunc double 0.000000e+00 to float
+; CHECK-NEXT: [[TRUNC_2:%.*]] = fptrunc double 1.000000e+00 to float
; CHECK-NEXT: [[INS_1:%.*]] = insertelement <4 x float> zeroinitializer, float poison, i64 0
-; CHECK-NEXT: [[INS_2:%.*]] = insertelement <4 x float> [[INS_1]], float 0.000000e+00, i64 0
+; CHECK-NEXT: [[INS_2:%.*]] = insertelement <4 x float> [[INS_1]], float [[TRUNC_1]], i64 0
+; CHECK-NEXT: [[EXT_1:%.*]] = fpext float [[TRUNC_1]] to double
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr double, ptr [[TMP0:%.*]], i64 1
-; CHECK-NEXT: store <2 x double> <double 0.000000e+00, double 1.000000e+00>, ptr [[GEP_1]], align 8
+; CHECK-NEXT: store double [[EXT_1]], ptr [[GEP_1]], align 8
+; CHECK-NEXT: [[EXT_2:%.*]] = fpext float [[TRUNC_2]] to double
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr double, ptr [[TMP0]], i64 2
+; CHECK-NEXT: store double [[EXT_2]], ptr [[GEP_2]], align 8
; CHECK-NEXT: ret <4 x float> [[INS_2]]
;
%trunc.1 = fptrunc double 0.000000e+00 to float
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
index 9f5744b17cb79e..39f3864820a86b 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -600,15 +600,27 @@ bb15: ; preds = %bb15, %bb14
define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c) {
; CHECK-LABEL: @test_bounds_removed_before_runtime_checks(
; CHECK-NEXT: entry:
-; CHECK-NEXT: store <2 x i32> <i32 10, i32 300>, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = fmul float 1.000000e+01, 2.000000e+01
+; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = fmul float 3.000000e+01, 2.000000e+01
+; CHECK-NEXT: [[TMP4:%.*]] = fptosi float [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 100, [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP2]], i32 10
+; CHECK-NEXT: [[TMP7:%.*]] = select i1 false, i32 0, i32 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 200, [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP4]], i32 300
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 false, i32 0, i32 [[TMP9]]
+; CHECK-NEXT: store i32 [[TMP7]], ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 1
+; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[B:%.*]], align 8
; CHECK-NEXT: br i1 [[C:%.*]], label [[BB23:%.*]], label [[BB14:%.*]]
; CHECK: bb14:
-; CHECK-NEXT: [[TMP15:%.*]] = sext i32 10 to i64
+; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 2, [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 3
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 2
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 2
; CHECK-NEXT: store float 0.000000e+00, ptr [[TMP20]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[TMP19]], align 1
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 3
@@ -654,17 +666,18 @@ bb23:
define void @single_membound(ptr %arg, ptr %arg1, double %x) {
; CHECK-LABEL: @single_membound(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
; CHECK-NEXT: [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
; CHECK-NEXT: store double [[TMP]], ptr [[TMP9]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr [[ARG1:%.*]], align 8
; CHECK-NEXT: [[TMP13:%.*]] = fsub double 1.000000e+00, [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 2
; CHECK-NEXT: br label [[BB15:%.*]]
; CHECK: bb15:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP13]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 2.000000e+01, double 3.000000e+01>
-; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = fmul double [[TMP]], 2.000000e+01
+; CHECK-NEXT: store double [[TMP16]], ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = fmul double [[TMP13]], 3.000000e+01
+; CHECK-NEXT: store double [[TMP17]], ptr [[TMP14]], align 8
; CHECK-NEXT: ret void
;
entry:
@@ -1231,25 +1244,27 @@ define void @crash_no_tracked_instructions(ptr %arg, ptr %arg.2, ptr %arg.3, i1
; CHECK-NEXT: entry:
; CHECK-NEXT: [[T19:%.*]] = load ptr, ptr [[ARG:%.*]], align 8
; CHECK-NEXT: [[T20:%.*]] = load float, ptr [[ARG_3:%.*]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> <float 0.000000e+00, float poison>, float [[T20]], i32 1
; CHECK-NEXT: br i1 [[C:%.*]], label [[BB22:%.*]], label [[BB30:%.*]]
; CHECK: bb22:
; CHECK-NEXT: [[T23:%.*]] = fmul float [[T20]], 9.900000e+01
+; CHECK-NEXT: [[T24:%.*]] = fmul float [[T23]], 9.900000e+01
; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds float, ptr [[T19]], i64 2
; CHECK-NEXT: [[T26:%.*]] = fmul float [[T23]], 1.000000e+01
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 9.900000e+01, float 1.000000e+01>
; CHECK-NEXT: store float [[T26]], ptr [[T25]], align 4
; CHECK-NEXT: [[T27:%.*]] = load float, ptr [[ARG_2:%.*]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], splat (float 2.000000e+01)
+; CHECK-NEXT: [[T28:%.*]] = fadd float [[T24]], 2.000000e+01
+; CHECK-NEXT: [[T29:%.*]] = fadd float [[T26]], 2.000000e+01
; CHECK-NEXT: br label [[BB30]]
; CHECK: bb30:
-; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP4]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[T31:%.*]] = phi float [ [[T28]], [[BB22]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[T32:%.*]] = phi float [ [[T29]], [[BB22]] ], [ [[T20]], [[ENTRY]] ]
; CHECK-NEXT: br label [[BB36:%.*]]
; CHECK: bb36:
-; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP5]], splat (float 3.000000e+00)
-; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[ARG_3]], align 4
+; CHECK-NEXT: [[T37:%.*]] = fmul float [[T31]], 3.000000e+00
+; CHECK-NEXT: store float [[T37]], ptr [[ARG_3]], align 4
+; CHECK-NEXT: [[T39:%.*]] = fmul float [[T32]], 3.000000e+00
+; CHECK-NEXT: [[T40:%.*]] = getelementptr inbounds float, ptr [[ARG_3]], i64 1
+; CHECK-NEXT: store float [[T39]], ptr [[T40]], align 4
; CHECK-NEXT: br label [[BB41:%.*]]
; CHECK: bb41:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
index feb4ad865f3147..289ad5f62c966f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-base.ll
@@ -279,10 +279,13 @@ define void @phi_store3(ptr %dst) {
; POW2-ONLY: invoke.cont8.loopexit:
; POW2-ONLY-NEXT: br label [[EXIT]]
; POW2-ONLY: exit:
-; POW2-ONLY-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
-; POW2-ONLY-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ <i32 1, i32 2>, [[ENTRY]] ], [ poison, [[INVOKE_CONT8_LOOPEXIT]] ]
-; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 2
-; POW2-ONLY-NEXT: store <2 x i32> [[TMP0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: [[P_0:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT:%.*]] ]
+; POW2-ONLY-NEXT: [[P_1:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT]] ]
+; POW2-ONLY-NEXT: [[P_2:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ 0, [[INVOKE_CONT8_LOOPEXIT]] ]
+; POW2-ONLY-NEXT: [[DST_1:%.*]] = getelementptr i32, ptr [[DST:%.*]], i32 1
+; POW2-ONLY-NEXT: [[DST_2:%.*]] = getelementptr i32, ptr [[DST]], i32 2
+; POW2-ONLY-NEXT: store i32 [[P_0]], ptr [[DST]], align 4
+; POW2-ONLY-NEXT: store i32 [[P_1]], ptr [[DST_1]], align 4
; POW2-ONLY-NEXT: store i32 [[P_2]], ptr [[DST_2]], align 4
; POW2-ONLY-NEXT: ret void
;
diff --git a/llvm/test/Transforms/SLPVectorizer/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/jumbled_store_crash.ll
index 50cc97a529f5fb..4f31e9f50f5e37 100644
--- a/llvm/test/Transforms/SLPVectorizer/jumbled_store_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/jumbled_store_crash.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -o - -S < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -o - -S < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -o - -S < %s | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -o - -S < %s | FileCheck %s --check-prefix=AARCH64 %}
@b = common dso_local global ptr null, align 8
@e = common dso_local global float 0.000000e+00, align 4
@@ -47,6 +47,82 @@ define dso_local void @j() local_unnamed_addr {
; CHECK-NEXT: store <4 x i32> [[TMP23]], ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: ret void
;
+; X86-LABEL: @j(
+; X86-NEXT: entry:
+; X86-NEXT: [[TMP0:%.*]] = load ptr, ptr @b, align 8
+; X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4
+; X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 12
+; X86-NEXT: [[TMP1:%.*]] = load i32, ptr @a, align 4
+; X86-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP1]] to float
+; X86-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 4
+; X86-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX1]], align 4
+; X86-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP2]]
+; X86-NEXT: [[TMP5:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x float>
+; X86-NEXT: [[TMP6:%.*]] = fmul <2 x float> [[TMP5]], splat (float 1.000000e+01)
+; X86-NEXT: [[TMP7:%.*]] = fsub <2 x float> <float 1.000000e+00, float 0.000000e+00>, [[TMP6]]
+; X86-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+; X86-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP8]], i32 1
+; X86-NEXT: store float [[TMP9]], ptr @g, align 4
+; X86-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[TMP8]], <float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; X86-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2
+; X86-NEXT: store float [[TMP11]], ptr @c, align 4
+; X86-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 0
+; X86-NEXT: store float [[TMP12]], ptr @d, align 4
+; X86-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3
+; X86-NEXT: store float [[TMP13]], ptr @e, align 4
+; X86-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 1
+; X86-NEXT: store float [[TMP14]], ptr @f, align 4
+; X86-NEXT: [[TMP15:%.*]] = insertelement <4 x float> <float poison, float -1.000000e+00, float poison, float -1.000000e+00>, float [[CONV19]], i32 0
+; X86-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; X86-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+; X86-NEXT: [[TMP18:%.*]] = fsub <4 x float> [[TMP10]], [[TMP17]]
+; X86-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP10]], [[TMP17]]
+; X86-NEXT: [[TMP20:%.*]] = shufflevector <4 x float> [[TMP18]], <4 x float> [[TMP19]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; X86-NEXT: [[TMP21:%.*]] = fptosi <4 x float> [[TMP20]] to <4 x i32>
+; X86-NEXT: store <4 x i32> [[TMP21]], ptr [[ARRAYIDX1]], align 4
+; X86-NEXT: ret void
+;
+; AARCH64-LABEL: @j(
+; AARCH64-NEXT: entry:
+; AARCH64-NEXT: [[TMP0:%.*]] = load ptr, ptr @b, align 8
+; AARCH64-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4
+; AARCH64-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 12
+; AARCH64-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 13
+; AARCH64-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 14
+; AARCH64-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 4
+; AARCH64-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX1]], align 4
+; AARCH64-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP1]]
+; AARCH64-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float>
+; AARCH64-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP4]], splat (float 1.000000e+01)
+; AARCH64-NEXT: [[TMP6:%.*]] = fsub <2 x float> <float 1.000000e+00, float 0.000000e+00>, [[TMP5]]
+; AARCH64-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
+; AARCH64-NEXT: store float [[TMP7]], ptr @g, align 4
+; AARCH64-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP6]], splat (float 1.000000e+00)
+; AARCH64-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; AARCH64-NEXT: store float [[TMP9]], ptr @c, align 4
+; AARCH64-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
+; AARCH64-NEXT: [[SUB10:%.*]] = fadd float [[TMP10]], -1.000000e+00
+; AARCH64-NEXT: store float [[SUB10]], ptr @d, align 4
+; AARCH64-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; AARCH64-NEXT: store float [[TMP11]], ptr @e, align 4
+; AARCH64-NEXT: [[SUB12:%.*]] = fadd float [[TMP7]], -1.000000e+00
+; AARCH64-NEXT: store float [[SUB12]], ptr @f, align 4
+; AARCH64-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> <float -1.000000e+00, float poison>, <2 x i32> <i32 2, i32 1>
+; AARCH64-NEXT: [[TMP13:%.*]] = fsub <2 x float> [[TMP8]], [[TMP12]]
+; AARCH64-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[TMP8]], [[TMP12]]
+; AARCH64-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x i32> <i32 1, i32 2>
+; AARCH64-NEXT: [[TMP16:%.*]] = fptosi <2 x float> [[TMP15]] to <2 x i32>
+; AARCH64-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX15]], align 4
+; AARCH64-NEXT: [[TMP17:%.*]] = load i32, ptr @a, align 4
+; AARCH64-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP17]] to float
+; AARCH64-NEXT: [[SUB20:%.*]] = fsub float [[SUB10]], [[CONV19]]
+; AARCH64-NEXT: [[CONV21:%.*]] = fptosi float [[SUB20]] to i32
+; AARCH64-NEXT: store i32 [[CONV21]], ptr [[ARRAYIDX1]], align 4
+; AARCH64-NEXT: [[SUB23:%.*]] = fadd float [[SUB12]], -1.000000e+00
+; AARCH64-NEXT: [[CONV24:%.*]] = fptosi float [[SUB23]] to i32
+; AARCH64-NEXT: store i32 [[CONV24]], ptr [[ARRAYIDX3]], align 4
+; AARCH64-NEXT: ret void
+;
entry:
%0 = load ptr, ptr @b, align 8
%arrayidx = getelementptr inbounds i32, ptr %0, i64 4
diff --git a/llvm/test/Transforms/SLPVectorizer/materialize-vector-of-consts.ll b/llvm/test/Transforms/SLPVectorizer/materialize-vector-of-consts.ll
index 2f58bd25b75647..f14eaf8cb4c326 100644
--- a/llvm/test/Transforms/SLPVectorizer/materialize-vector-of-consts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/materialize-vector-of-consts.ll
@@ -4,9 +4,10 @@
define <2 x float> @v2f32_diff_consts(float %a, float %b)
; CHECK-LABEL: define <2 x float> @v2f32_diff_consts(
; CHECK-SAME: float [[A:%.*]], float [[B:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[B]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], <float 2.200000e+01, float 2.300000e+01>
+; CHECK-NEXT: [[TMP1:%.*]] = fmul float [[A]], 2.200000e+01
+; CHECK-NEXT: [[TMP2:%.*]] = fmul float [[B]], 2.300000e+01
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP2]], i32 1
; CHECK-NEXT: ret <2 x float> [[TMP3]]
;
{
@@ -33,6 +34,8 @@ define <2 x float> @v2f32_const_splat(float %a, float %b)
ret <2 x float> %4
}
+; This needs type legalization since <4 x double> won't fit into a single register.
+; So, for now, we bail out when calculating the cost of the vector of constants.
define <4 x double> @v4f64_illegal_type(double %a, double %b, double %c, double %d)
; CHECK-LABEL: define <4 x double> @v4f64_illegal_type(
; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]], double [[D:%.*]]) {
@@ -55,15 +58,22 @@ define <4 x double> @v4f64_illegal_type(double %a, double %b, double %c, double
ret <4 x double> %8
}
+; Here, we have 2 SLP trees. Both calculate the cost of <2 x double> <double 21.0, double 22.0>
+; separately and hence neither tree gets vectorized. But, in terms of codegen,
+; this const vector needs to be realized only once, so considering the cost of the const
+; vector twice is inappropriate.
+; Surprisingly, though, llvm-mca for -mtriple=aarch64 shows the scalar version to be slightly better.
define <2 x double> @v2f64_dup_const_vector_case1(double %a, double %b, double %c, double %d)
; CHECK-LABEL: define <2 x double> @v2f64_dup_const_vector_case1(
; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]], double [[D:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], <double 2.100000e+01, double 2.200000e+01>
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[D]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 2.100000e+01, double 2.200000e+01>
+; CHECK-NEXT: [[TMP1:%.*]] = fmul double [[A]], 2.100000e+01
+; CHECK-NEXT: [[TMP2:%.*]] = fmul double [[B]], 2.200000e+01
+; CHECK-NEXT: [[TMP8:%.*]] = fmul double [[C]], 2.100000e+01
+; CHECK-NEXT: [[TMP4:%.*]] = fmul double [[D]], 2.200000e+01
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP5]], double [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP4]], i32 1
; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; CHECK-NEXT: ret <2 x double> [[TMP7]]
;
@@ -80,6 +90,7 @@ define <2 x double> @v2f64_dup_const_vector_case1(double %a, double %b, double %
ret <2 x double> %9
}
+; llvm-mca for -mtriple=aarch64 shows the scalar version to be only slightly better.
define <2 x double> @v2f64_dup_const_vector_case2(double %a, double %b, double %c, double %d)
; CHECK-LABEL: define <2 x double> @v2f64_dup_const_vector_case2(
; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]], double [[D:%.*]]) {
diff --git a/llvm/test/Transforms/SLPVectorizer/multi-node-vectorized-insts.ll b/llvm/test/Transforms/SLPVectorizer/multi-node-vectorized-insts.ll
index 8abc6ef236a3c0..7b272454ccb3fe 100644
--- a/llvm/test/Transforms/SLPVectorizer/multi-node-vectorized-insts.ll
+++ b/llvm/test/Transforms/SLPVectorizer/multi-node-vectorized-insts.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-X86 %}
+; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-AARCH64 %}
define void @test(double %0) {
; CHECK-LABEL: define void @test(
@@ -78,19 +78,34 @@ define void @test1(double %0, <4 x double> %v) {
}
define void @test2(double %0) {
-; CHECK-LABEL: define void @test2(
-; CHECK-SAME: double [[TMP0:%.*]]) {
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: br label [[TMP4:%.*]]
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> <double 3.000000e+00, double 2.000000e+00>, [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> <double 3.000000e+00, double 1.000000e+00>, [[TMP3]]
-; CHECK-NEXT: br label [[DOTBACKEDGE:%.*]]
-; CHECK: .backedge:
-; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], zeroinitializer
-; CHECK-NEXT: br label [[TMP4]]
+; CHECK-X86-LABEL: define void @test2(
+; CHECK-X86-SAME: double [[TMP0:%.*]]) {
+; CHECK-X86-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0
+; CHECK-X86-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer
+; CHECK-X86-NEXT: br label [[TMP4:%.*]]
+; CHECK-X86: 4:
+; CHECK-X86-NEXT: [[TMP5:%.*]] = fsub <2 x double> <double 3.000000e+00, double 2.000000e+00>, [[TMP3]]
+; CHECK-X86-NEXT: [[TMP6:%.*]] = fsub <2 x double> <double 3.000000e+00, double 1.000000e+00>, [[TMP3]]
+; CHECK-X86-NEXT: br label [[DOTBACKEDGE:%.*]]
+; CHECK-X86: .backedge:
+; CHECK-X86-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP5]], [[TMP6]]
+; CHECK-X86-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], zeroinitializer
+; CHECK-X86-NEXT: br label [[TMP4]]
+;
+; CHECK-AARCH64-LABEL: define void @test2(
+; CHECK-AARCH64-SAME: double [[TMP0:%.*]]) {
+; CHECK-AARCH64-NEXT: br label [[TMP2:%.*]]
+; CHECK-AARCH64: 2:
+; CHECK-AARCH64-NEXT: [[TMP3:%.*]] = fsub double 1.000000e+00, [[TMP0]]
+; CHECK-AARCH64-NEXT: [[TMP4:%.*]] = fsub double 2.000000e+00, [[TMP0]]
+; CHECK-AARCH64-NEXT: [[TMP5:%.*]] = fsub double 3.000000e+00, [[TMP0]]
+; CHECK-AARCH64-NEXT: br label [[DOTBACKEDGE:%.*]]
+; CHECK-AARCH64: .backedge:
+; CHECK-AARCH64-NEXT: [[TMP6:%.*]] = fmul double [[TMP4]], [[TMP3]]
+; CHECK-AARCH64-NEXT: [[TMP7:%.*]] = fcmp olt double [[TMP6]], 0.000000e+00
+; CHECK-AARCH64-NEXT: [[TMP8:%.*]] = fmul double [[TMP5]], [[TMP5]]
+; CHECK-AARCH64-NEXT: [[TMP9:%.*]] = fcmp olt double [[TMP8]], 0.000000e+00
+; CHECK-AARCH64-NEXT: br label [[TMP2]]
;
br label %2