[llvm] [CostModel][AArch64] Make extractelement, with fmul user, free whenev… (PR #111479)

Sushant Gokhale via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 9 00:57:16 PDT 2024


https://github.com/sushgokh updated https://github.com/llvm/llvm-project/pull/111479

>From 85ca8b215cd5ae51f1ac772d0ecb7e551bef9d55 Mon Sep 17 00:00:00 2001
From: sgokhale <sgokhale at nvidia.com>
Date: Tue, 8 Oct 2024 10:02:40 +0530
Subject: [PATCH] [CostModel][AArch64] Make extractelement, with fmul user,
 free whenever possible

On Neon, if there exists an extractelement from a lane != 0 such that
  1. the extractelement does not necessitate a move from vector_reg -> GPR,
  2. the extractelement result feeds into fmul, and
  3. the other operand of the fmul is a scalar, or an extractelement from
     lane 0 or a lane equivalent to 0,
  then the extractelement can be merged with the fmul in the backend and
  incurs no cost.
  e.g.
  define double @foo(<2 x double> %a) {
    %1 = extractelement <2 x double> %a, i32 0
    %2 = extractelement <2 x double> %a, i32 1
    %res = fmul double %1, %2
    ret double %res
  }
  %2 and %res can be merged in the backend to generate:
  fmul    d0, d0, v0.d[1]
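
  Without the fusion, the backend would first have to move the lane into
  a scalar register, along the lines of (a hand-written sketch of typical
  codegen, not output from an actual compilation):
  mov     d1, v0.d[1]
  fmul    d0, d0, d1

  "Lane equivalent to 0" above refers to a lane that begins a new vector
  register after type legalization: assuming 128-bit Neon registers, lane 2
  of a <4 x double> value is lane 0 of the second 128-bit register, so an
  extract from it needs no lane move either.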

The change was tested with SPEC FP (C/C++) on Neoverse V2.
Compile-time impact: none.
Performance impact: a 1.3-1.7% uplift on the lbm benchmark with -flto,
depending on the configuration.
---
 .../llvm/Analysis/TargetTransformInfo.h       |  26 +++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   7 +
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  15 +-
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  13 ++
 .../AArch64/AArch64TargetTransformInfo.cpp    | 171 +++++++++++++++++-
 .../AArch64/AArch64TargetTransformInfo.h      |  13 +-
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  12 +-
 .../CostModel/AArch64/extract_float.ll        |  29 +--
 .../SLPVectorizer/consecutive-access.ll       |  70 +++----
 9 files changed, 284 insertions(+), 72 deletions(-)
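
A minimal usage sketch of the new TTI hook, mirroring the SLPVectorizer
change below (ExternalUses, EU.Scalar, EU.User and EU.Lane are the
SLP-internal names; VecTy and CostKind come from the surrounding cost
computation):

  // Collect one {scalar, user, lane} triple per external use of the tree,
  // so the target sees all users of an extracted scalar at once.
  SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
  for (ExternalUser &EU : ExternalUses)
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);

  // AArch64 returns 0 for an extract it can fold into an fmul operand.
  InstructionCost ExtraCost =
      TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
                              EU.Lane, EU.Scalar, ScalarUserAndIdx);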

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 89a85bc8a90864..8ed16284bb67f2 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -22,6 +22,8 @@
 #define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
 
 #include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/PassManager.h"
@@ -1392,6 +1394,16 @@ class TargetTransformInfo {
                                      unsigned Index = -1, Value *Op0 = nullptr,
                                      Value *Op1 = nullptr) const;
 
+  /// \return The expected cost of vector Insert and Extract.
+  /// Use -1 to indicate that there is no information on the index value.
+  /// This overload is used by vectorizer passes when the instruction does
+  /// not yet exist; \p Scalar is the scalar to be inserted or extracted and
+  /// \p ScalarUserAndIdx lists the {scalar, user, index} triples in the tree.
+  InstructionCost getVectorInstrCost(
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+      Value *Scalar,
+      const ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const;
+
   /// \return The expected cost of vector Insert and Extract.
   /// This is used when instruction is available, and implementation
   /// asserts 'I' is not nullptr.
@@ -2062,6 +2074,12 @@ class TargetTransformInfo::Concept {
                                              TTI::TargetCostKind CostKind,
                                              unsigned Index, Value *Op0,
                                              Value *Op1) = 0;
+
+  virtual InstructionCost getVectorInstrCost(
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+      Value *Scalar,
+      const ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) = 0;
+
   virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                              TTI::TargetCostKind CostKind,
                                              unsigned Index) = 0;
@@ -2726,6 +2744,14 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
                                      Value *Op1) override {
     return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
   }
+  InstructionCost
+  getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind,
+                     unsigned Index, Value *Scalar,
+                     const ArrayRef<std::tuple<Value *, User *, int>>
+                         ScalarUserAndIdx) override {
+    return Impl.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
+                                   ScalarUserAndIdx);
+  }
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index) override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 50040dc8f6165b..a161a758765d8f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -683,6 +683,13 @@ class TargetTransformInfoImplBase {
     return 1;
   }
 
+  InstructionCost getVectorInstrCost(
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+      Value *Scalar,
+      const ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
+    return 1;
+  }
+
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index) const {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c36a346c1b2e05..dbb269bee04f70 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1277,12 +1277,21 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return 1;
   }
 
-  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
-                                     TTI::TargetCostKind CostKind,
-                                     unsigned Index, Value *Op0, Value *Op1) {
+  virtual InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
+                                             TTI::TargetCostKind CostKind,
+                                             unsigned Index, Value *Op0,
+                                             Value *Op1) {
     return getRegUsageForType(Val->getScalarType());
   }
 
+  InstructionCost getVectorInstrCost(
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+      Value *Scalar,
+      const ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
+    // The base implementation ignores Scalar and its users.
+    return getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr, nullptr);
+  }
+
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index b5195f764cbd1c..b50e2aa554ba0a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1037,6 +1037,19 @@ InstructionCost TargetTransformInfo::getVectorInstrCost(
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getVectorInstrCost(
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+    Value *Scalar,
+    const ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
+  // FIXME: Assert that Opcode is either InsertElement or ExtractElement.
+  // This is mentioned in the interface description and respected by all
+  // callers, but never asserted upon.
+  InstructionCost Cost = TTIImpl->getVectorInstrCost(
+      Opcode, Val, CostKind, Index, Scalar, ScalarUserAndIdx);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost
 TargetTransformInfo::getVectorInstrCost(const Instruction &I, Type *Val,
                                         TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 80d5168ae961ab..a3488e8d8e7ac3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -10,20 +10,28 @@
 #include "AArch64ExpandImm.h"
 #include "AArch64PerfectShuffle.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/CostTable.h"
 #include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/User.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
 #include <algorithm>
+#include <cassert>
 #include <optional>
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -3145,12 +3153,16 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
   return 0;
 }
 
-InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
-                                                         Type *Val,
-                                                         unsigned Index,
-                                                         bool HasRealUse) {
+InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
+    std::variant<const Instruction *, const unsigned> InstOrOpcode, Type *Val,
+    unsigned Index, bool HasRealUse, Value *Scalar,
+    const ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
   assert(Val->isVectorTy() && "This must be a vector type");
 
+  const auto *I = (std::holds_alternative<const Instruction *>(InstOrOpcode)
+                       ? get<const Instruction *>(InstOrOpcode)
+                       : nullptr);
+
   if (Index != -1U) {
     // Legalize the type.
     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
@@ -3194,6 +3206,149 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
     // compile-time considerations.
   }
 
+  // On Neon, if there exists an extractelement from a lane != 0 such that
+  // 1. the extractelement does not necessitate a move from vector_reg -> GPR,
+  // 2. the extractelement result feeds into fmul, and
+  // 3. the other operand of the fmul is a scalar, or an extractelement from
+  //    lane 0 or a lane equivalent to 0,
+  // then the extractelement can be merged with the fmul in the backend and
+  // incurs no cost.
+  // e.g.
+  // define double @foo(<2 x double> %a) {
+  //   %1 = extractelement <2 x double> %a, i32 0
+  //   %2 = extractelement <2 x double> %a, i32 1
+  //   %res = fmul double %1, %2
+  //   ret double %res
+  // }
+  // %2 and %res can be merged in the backend to generate fmul d0, d0, v0.d[1]
+  auto ExtractCanFuseWithFmul = [&]() {
+    // We bail out if the extract is from lane 0.
+    if (Index == 0)
+      return false;
+
+    // Check if the scalar element type of the vector operand of ExtractElement
+    // instruction is one of the allowed types.
+    auto IsAllowedScalarTy = [&](const Type *T) {
+      return T->isFloatTy() || T->isDoubleTy() ||
+             (T->isHalfTy() && ST->hasFullFP16());
+    };
+
+    // Check if the extractelement user is scalar fmul.
+    auto IsUserFMulScalarTy = [](const Value *EEUser) {
+      // Check if the user is scalar fmul.
+      const auto *BO = dyn_cast_if_present<BinaryOperator>(EEUser);
+      return BO && BO->getOpcode() == BinaryOperator::FMul &&
+             !BO->getType()->isVectorTy();
+    };
+
+    // InstCombine combines fmul with fadd/fsub. Hence, extractelement fusion
+    // with fmul does not happen.
+    auto IsFMulUserFAddFSub = [](const Value *FMul) {
+      return any_of(FMul->users(), [](const User *U) {
+        const auto *BO = dyn_cast_if_present<BinaryOperator>(U);
+        return (BO && (BO->getOpcode() == BinaryOperator::FAdd ||
+                       BO->getOpcode() == BinaryOperator::FSub));
+      });
+    };
+
+    // Check if the type constraints on input vector type and result scalar type
+    // of extractelement instruction are satisfied.
+    auto TypeConstraintsOnEESatisfied =
+        [&IsAllowedScalarTy](const Type *VectorTy, const Type *ScalarTy) {
+          return isa<FixedVectorType>(VectorTy) && IsAllowedScalarTy(ScalarTy);
+        };
+
+    // Check if the extract index is from lane 0 or lane equivalent to 0 for a
+    // certain scalar type and a certain vector register width.
+    auto IsExtractLaneEquivalentToZero = [&](const unsigned &Idx,
+                                             const unsigned &EltSz) {
+      auto RegWidth =
+          getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+              .getFixedValue();
+      return (Idx == 0 || (Idx * EltSz) % RegWidth == 0);
+    };
+
+    if (std::holds_alternative<const unsigned>(InstOrOpcode)) {
+      if (!TypeConstraintsOnEESatisfied(Val, Val->getScalarType()))
+        return false;
+
+      for (auto &RefT : ScalarUserAndIdx) {
+        Value *RefS = get<0>(RefT);
+        User *RefU = get<1>(RefT);
+        const int &RefL = get<2>(RefT);
+
+        // Analyze all the users that have the same scalar/index as Scalar/Index.
+        if (RefS != Scalar || RefL != Index)
+          continue;
+
+        // Check if the user of {Scalar, Index} pair is fmul user.
+        if (!IsUserFMulScalarTy(RefU) || IsFMulUserFAddFSub(RefU))
+          return false;
+
+        // For RefU, check if the other operand is an extract from the same
+        // SLP tree. If not, bail out, since we cannot analyze extracts from
+        // other SLP trees.
+        unsigned NumExtractEltsIntoUser = 0;
+        for (auto &CmpT : ScalarUserAndIdx) {
+          // Skip the reference triple itself and extracts feeding other users.
+          User *CmpU = get<1>(CmpT);
+          if (CmpT == RefT || CmpU != RefU)
+            continue;
+          ++NumExtractEltsIntoUser;
+          const int &CmpL = get<2>(CmpT);
+          if (!IsExtractLaneEquivalentToZero(CmpL, Val->getScalarSizeInBits()))
+            return false;
+        }
+        // We know this is an fmul user with just 2 operands, one being RefT.
+        // If we can't find exactly one CmpT as the other operand, bail out.
+        if (NumExtractEltsIntoUser != 1)
+          return false;
+      }
+    } else {
+      const auto *EE = cast<ExtractElementInst>(I);
+
+      const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
+      if (!IdxOp)
+        return false;
+
+      if (!TypeConstraintsOnEESatisfied(EE->getVectorOperand()->getType(),
+                                        EE->getType()))
+        return false;
+
+      return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
+        if (!IsUserFMulScalarTy(U) || IsFMulUserFAddFSub(U))
+          return false;
+
+        // Check if the other operand of the fmul is also an extractelement
+        // from a lane equivalent to 0.
+        const auto *BO = cast<BinaryOperator>(U);
+        const auto *OtherEE = dyn_cast<ExtractElementInst>(
+            BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
+        if (OtherEE) {
+          const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
+          if (!IdxOp)
+            return false;
+          // Reuse the constant index we just matched rather than re-casting
+          // the operand.
+          return IsExtractLaneEquivalentToZero(
+              IdxOp->getValue().getZExtValue(),
+              OtherEE->getType()->getScalarSizeInBits());
+        }
+        return true;
+      });
+    }
+    return true;
+  };
+
+  if (std::holds_alternative<const unsigned>(InstOrOpcode)) {
+    const unsigned &Opcode = get<const unsigned>(InstOrOpcode);
+    if (Opcode == Instruction::ExtractElement && ExtractCanFuseWithFmul())
+      return 0;
+  } else if (I && I->getOpcode() == Instruction::ExtractElement &&
+             ExtractCanFuseWithFmul()) {
+    return 0;
+  }
+
   // All other insert/extracts cost this much.
   return ST->getVectorInsertExtractBaseCost();
 }
@@ -3207,6 +3362,14 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
 }
 
+InstructionCost AArch64TTIImpl::getVectorInstrCost(
+    unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+    Value *Scalar,
+    const ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
+  return getVectorInstrCostHelper(Opcode, Val, Index, false, Scalar,
+                                  ScalarUserAndIdx);
+}
+
 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
                                                    Type *Val,
                                                    TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 28e45207596ecd..cd0bb7af3a7ff9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -66,8 +66,11 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
   // indicates whether the vector instruction is available in the input IR or
   // just imaginary in vectorizer passes.
-  InstructionCost getVectorInstrCostHelper(const Instruction *I, Type *Val,
-                                           unsigned Index, bool HasRealUse);
+  InstructionCost getVectorInstrCostHelper(
+      std::variant<const Instruction *, const unsigned> InstOrOpcode, Type *Val,
+      unsigned Index, bool HasRealUse, Value *Scalar = nullptr,
+      const ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx =
+          ArrayRef<std::tuple<Value *, User *, int>>());
 
 public:
   explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F)
@@ -185,6 +188,12 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index, Value *Op0, Value *Op1);
+
+  InstructionCost getVectorInstrCost(
+      unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
+      Value *Scalar,
+      const ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx);
+
   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 401597af35bdac..d69e2e22070492 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11633,6 +11633,13 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
   std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
   DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
   SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
+  // Keep track of each {Scalar, User, Index} tuple.
+  // On AArch64, this helps recognize extractelements whose mov from a vector
+  // lane can be folded into an fmul in the backend, making the extract free.
+  SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
+  for (ExternalUser &EU : ExternalUses) {
+    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
+  }
   for (ExternalUser &EU : ExternalUses) {
     // Uses by ephemeral values are free (because the ephemeral value will be
     // removed prior to code generation, and so the extraction will be
@@ -11739,8 +11746,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
       ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
                                                 VecTy, EU.Lane);
     } else {
-      ExtraCost = TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
-                                          CostKind, EU.Lane);
+      ExtraCost =
+          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
+                                  EU.Lane, EU.Scalar, ScalarUserAndIdx);
     }
     // Leave the scalar instructions as is if they are cheaper than extracts.
     if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
diff --git a/llvm/test/Analysis/CostModel/AArch64/extract_float.ll b/llvm/test/Analysis/CostModel/AArch64/extract_float.ll
index dd3d0289bbb1cf..d2b75faa014d68 100644
--- a/llvm/test/Analysis/CostModel/AArch64/extract_float.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/extract_float.ll
@@ -8,7 +8,7 @@
 define double @extract_case1(<2 x double> %a) {
 ; CHECK-LABEL: 'extract_case1'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <2 x double> %a, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
 entry:
@@ -57,7 +57,7 @@ entry:
 ; res = lane 1 * scalar
 define double @extract_case5(<2 x double> %a, double %b) {
 ; CHECK-LABEL: 'extract_case5'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %0 = extractelement <2 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x double> %a, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %b
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
 entry:
@@ -71,7 +71,7 @@ entry:
 define double @extract_case6(<3 x double> %a) {
 ; CHECK-LABEL: 'extract_case6'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <3 x double> %a, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <3 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <3 x double> %a, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
 entry:
@@ -86,7 +86,7 @@ entry:
 ; register. But for other register sizes, this is not the case.
 define double @extract_case7(<4 x double> %a) {
 ; CHECK-LABEL: 'extract_case7'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %0 = extractelement <4 x double> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <4 x double> %a, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <4 x double> %a, i32 2
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul double %0, %1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret double %res
@@ -158,11 +158,17 @@ entry:
 
 ; res = lane 0 * lane 1
 define half @extract_case11(<2 x half> %a) {
-; CHECK-LABEL: 'extract_case11'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x half> %a, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x half> %a, i32 1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul half %0, %1
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret half %res
+; NOFP16-LABEL: 'extract_case11'
+; NOFP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x half> %a, i32 0
+; NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x half> %a, i32 1
+; NOFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul half %0, %1
+; NOFP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret half %res
+;
+; FULLFP16-LABEL: 'extract_case11'
+; FULLFP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x half> %a, i32 0
+; FULLFP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <2 x half> %a, i32 1
+; FULLFP16-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul half %0, %1
+; FULLFP16-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret half %res
 entry:
   %1 = extractelement <2 x half> %a, i32 0
   %2 = extractelement <2 x half> %a, i32 1
@@ -174,7 +180,7 @@ entry:
 define float @extract_case12(<2 x float> %a) {
 ; CHECK-LABEL: 'extract_case12'
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %0 = extractelement <2 x float> %a, i32 0
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = extractelement <2 x float> %a, i32 1
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %1 = extractelement <2 x float> %a, i32 1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %res = fmul float %0, %1
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret float %res
 entry:
@@ -200,6 +206,3 @@ entry:
 }
 
 declare void @foo(double)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; FULLFP16: {{.*}}
-; NOFP16: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
index 369ca28ece55b8..db24ccc2a5b346 100644
--- a/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
+++ b/llvm/test/Transforms/SLPVectorizer/consecutive-access.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-apple-macosx10.9.0 | FileCheck %s --check-prefixes=CHECK-X86 %}
-; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=CHECK-AARCH64 %}
+; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=x86_64-apple-macosx10.9.0 | FileCheck %s --check-prefix=CHECK %}
+; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefix=CHECK %}
 
 @A = common global [2000 x double] zeroinitializer, align 16
 @B = common global [2000 x double] zeroinitializer, align 16
@@ -439,54 +439,28 @@ for.end:                                          ; preds = %for.cond.for.end_cr
 ; Make sure we are able to vectorize this from now on:
 ;
 define double @bar(ptr nocapture readonly %a, i32 %n) local_unnamed_addr #0 {
-; CHECK-X86-LABEL: @bar(
-; CHECK-X86-NEXT:  entry:
-; CHECK-X86-NEXT:    [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-X86-NEXT:    br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
-; CHECK-X86:       for.cond.cleanup:
-; CHECK-X86-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-X86-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 0
-; CHECK-X86-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
-; CHECK-X86-NEXT:    [[MUL:%.*]] = fmul double [[TMP1]], [[TMP2]]
-; CHECK-X86-NEXT:    ret double [[MUL]]
-; CHECK-X86:       for.body:
-; CHECK-X86-NEXT:    [[I_018:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-X86-NEXT:    [[TMP3:%.*]] = phi <2 x double> [ [[TMP5]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ]
-; CHECK-X86-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_018]] to i64
-; CHECK-X86-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[IDXPROM]]
-; CHECK-X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
-; CHECK-X86-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP3]], [[TMP4]]
-; CHECK-X86-NEXT:    [[ADD5]] = add i32 [[I_018]], 2
-; CHECK-X86-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]]
-; CHECK-X86-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]]
+; CHECK-LABEL: @bar(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
+; CHECK-NEXT:    [[MUL:%.*]] = fmul double [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret double [[MUL]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_018:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x double> [ [[TMP5]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_018]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[ADD5]] = add i32 [[I_018]], 2
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]]
 ;
 
-; CHECK-AARCH64-LABEL: @bar(
-; CHECK-AARCH64-NEXT:  entry:
-; CHECK-AARCH64-NEXT:    [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-AARCH64-NEXT:    br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
-; CHECK-AARCH64:       for.cond.cleanup:
-; CHECK-AARCH64-NEXT:    [[X_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-AARCH64-NEXT:    [[Y_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY]] ]
-; CHECK-AARCH64-NEXT:    [[MUL:%.*]] = fmul double [[X_0_LCSSA]], [[Y_0_LCSSA]]
-; CHECK-AARCH64-NEXT:    ret double [[MUL]]
-; CHECK-AARCH64:       for.body:
-; CHECK-AARCH64-NEXT:    [[I_018:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-AARCH64-NEXT:    [[Y_017:%.*]] = phi double [ [[ADD4]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ]
-; CHECK-AARCH64-NEXT:    [[X_016:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ]
-; CHECK-AARCH64-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_018]] to i64
-; CHECK-AARCH64-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[IDXPROM]]
-; CHECK-AARCH64-NEXT:    [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8
-; CHECK-AARCH64-NEXT:    [[ADD]] = fadd double [[X_016]], [[TMP0]]
-; CHECK-AARCH64-NEXT:    [[ADD1:%.*]] = or disjoint i32 [[I_018]], 1
-; CHECK-AARCH64-NEXT:    [[IDXPROM2:%.*]] = zext i32 [[ADD1]] to i64
-; CHECK-AARCH64-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[IDXPROM2]]
-; CHECK-AARCH64-NEXT:    [[TMP1:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
-; CHECK-AARCH64-NEXT:    [[ADD4]] = fadd double [[Y_017]], [[TMP1]]
-; CHECK-AARCH64-NEXT:    [[ADD5]] = add i32 [[I_018]], 2
-; CHECK-AARCH64-NEXT:    [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]]
-; CHECK-AARCH64-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]]
-;
 
 entry:
   %cmp15 = icmp eq i32 %n, 0



More information about the llvm-commits mailing list