[llvm] 0dcd2b4 - [TTI] Remove default condition type and predicate arguments from getCmpSelInstrCost

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 6 07:40:48 PDT 2021


Author: Simon Pilgrim
Date: 2021-10-06T15:40:35+01:00
New Revision: 0dcd2b40e6879542d708fdb9dcfdcbcaffdc2ce7

URL: https://github.com/llvm/llvm-project/commit/0dcd2b40e6879542d708fdb9dcfdcbcaffdc2ce7
DIFF: https://github.com/llvm/llvm-project/commit/0dcd2b40e6879542d708fdb9dcfdcbcaffdc2ce7.diff

LOG: [TTI] Remove default condition type and predicate arguments from getCmpSelInstrCost

We need to be better at exposing the comparison predicate to getCmpSelInstrCost calls, as some targets (e.g. X86 SSE) have very different costs for different comparisons (PR48337) and we can't always rely on the optional Instruction argument.

This initial commit requires explicit condition type and predicate arguments. The next step will be to review many of the existing getCmpSelInstrCost calls that have used BAD_ICMP_PREDICATE even when the predicate is known.
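
For illustration, a minimal sketch of how a call site migrates (the variable names below are illustrative, not taken from the patch). A caller that previously relied on the defaults, e.g. TTI.getCmpSelInstrCost(Instruction::ICmp, ScalarTy), now spells out the condition type and predicate:

  // Sketch of an updated caller: the condition type and predicate are now
  // required; pass CmpInst::BAD_ICMP_PREDICATE only when the predicate
  // really is unknown.
  Type *CondTy = CmpInst::makeCmpResultType(ScalarTy);
  InstructionCost Cost = TTI.getCmpSelInstrCost(
      Instruction::ICmp, ScalarTy, CondTy, CmpInst::ICMP_SLT,
      TTI::TCK_RecipThroughput);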

Differential Revision: https://reviews.llvm.org/D111024

Added: 
    

Modified: 
    llvm/include/llvm/Analysis/TargetTransformInfo.h
    llvm/include/llvm/Transforms/Utils/LoopUtils.h
    llvm/lib/Transforms/Utils/LoopUtils.cpp
    llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
    llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
    llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
    llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 9f68681251f6c..370ab30726848 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1103,8 +1103,8 @@ class TargetTransformInfo {
   /// is using a compare with the specified predicate as condition. When vector
   /// types are passed, \p VecPred must be used for all lanes.
   InstructionCost
-  getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy = nullptr,
-                     CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE,
+  getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                     CmpInst::Predicate VecPred,
                      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
                      const Instruction *I = nullptr) const;
 

diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index a425aa2a9fba1..22316eb754893 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -348,6 +348,9 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                         SinkAndHoistLICMFlags *LICMFlags = nullptr,
                         OptimizationRemarkEmitter *ORE = nullptr);
 
+/// Returns the comparison predicate used when expanding a min/max reduction.
+CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK);
+
 /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind.
 /// The Builder's fast-math-flags must be set to propagate the expected values.
 Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,

diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index b8f8ad579d58f..78d756a9d3552 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -889,32 +889,28 @@ bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
   return true;
 }
 
-Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
-                            Value *Right) {
-  CmpInst::Predicate Pred;
+CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) {
   switch (RK) {
   default:
     llvm_unreachable("Unknown min/max recurrence kind");
   case RecurKind::UMin:
-    Pred = CmpInst::ICMP_ULT;
-    break;
+    return CmpInst::ICMP_ULT;
   case RecurKind::UMax:
-    Pred = CmpInst::ICMP_UGT;
-    break;
+    return CmpInst::ICMP_UGT;
   case RecurKind::SMin:
-    Pred = CmpInst::ICMP_SLT;
-    break;
+    return CmpInst::ICMP_SLT;
   case RecurKind::SMax:
-    Pred = CmpInst::ICMP_SGT;
-    break;
+    return CmpInst::ICMP_SGT;
   case RecurKind::FMin:
-    Pred = CmpInst::FCMP_OLT;
-    break;
+    return CmpInst::FCMP_OLT;
   case RecurKind::FMax:
-    Pred = CmpInst::FCMP_OGT;
-    break;
+    return CmpInst::FCMP_OGT;
   }
+}
 
+Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
+                            Value *Right) {
+  CmpInst::Predicate Pred = getMinMaxReductionPredicate(RK);
   Value *Cmp = Builder.CreateCmp(Pred, Left, Right, "rdx.minmax.cmp");
   Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
   return Select;

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 61a4cbec6e038..bdf3f44742bdd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8579,28 +8579,32 @@ class HorizontalReduction {
     }
     case RecurKind::FMax:
     case RecurKind::FMin: {
+      auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
       auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
       VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
                                                /*unsigned=*/false, CostKind);
-      ScalarCost =
-          TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
-          TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                  CmpInst::makeCmpResultType(ScalarTy));
+      CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
+      ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
+                                           SclCondTy, RdxPred, CostKind) +
+                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+                                           SclCondTy, RdxPred, CostKind);
       break;
     }
     case RecurKind::SMax:
     case RecurKind::SMin:
     case RecurKind::UMax:
     case RecurKind::UMin: {
+      auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
       auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
       bool IsUnsigned =
           RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
       VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned,
                                                CostKind);
-      ScalarCost =
-          TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
-          TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
-                                  CmpInst::makeCmpResultType(ScalarTy));
+      CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
+      ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,
+                                           SclCondTy, RdxPred, CostKind) +
+                   TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+                                           SclCondTy, RdxPred, CostKind);
       break;
     }
     default:

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 2038655051b37..cc634c56282ad 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -82,7 +82,7 @@ class VectorCombine {
                                         ExtractElementInst *Ext1,
                                         unsigned PreferredExtractIndex) const;
   bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
-                             unsigned Opcode,
+                             const Instruction &I,
                              ExtractElementInst *&ConvertToShuffle,
                              unsigned PreferredExtractIndex);
   void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
@@ -299,12 +299,13 @@ ExtractElementInst *VectorCombine::getShuffleExtract(
 /// \p ConvertToShuffle to that extract instruction.
 bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
                                           ExtractElementInst *Ext1,
-                                          unsigned Opcode,
+                                          const Instruction &I,
                                           ExtractElementInst *&ConvertToShuffle,
                                           unsigned PreferredExtractIndex) {
   assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
          isa<ConstantInt>(Ext1->getOperand(1)) &&
          "Expected constant extract indexes");
+  unsigned Opcode = I.getOpcode();
   Type *ScalarTy = Ext0->getType();
   auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
   InstructionCost ScalarOpCost, VectorOpCost;
@@ -317,10 +318,11 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
   } else {
     assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
            "Expected a compare");
-    ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy,
-                                          CmpInst::makeCmpResultType(ScalarTy));
-    VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy,
-                                          CmpInst::makeCmpResultType(VecTy));
+    CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
+    ScalarOpCost = TTI.getCmpSelInstrCost(
+        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
+    VectorOpCost = TTI.getCmpSelInstrCost(
+        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
   }
 
   // Get cost estimates for the extract elements. These costs will factor into
@@ -495,8 +497,7 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
           m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
 
   ExtractElementInst *ExtractToChange;
-  if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange,
-                            InsertIndex))
+  if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
     return false;
 
   if (ExtractToChange) {
@@ -640,8 +641,11 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
   unsigned Opcode = I.getOpcode();
   InstructionCost ScalarOpCost, VectorOpCost;
   if (IsCmp) {
-    ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
-    VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
+    CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
+    ScalarOpCost = TTI.getCmpSelInstrCost(
+        Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred);
+    VectorOpCost = TTI.getCmpSelInstrCost(
+        Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred);
   } else {
     ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
     VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
@@ -741,7 +745,10 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
   InstructionCost OldCost =
       TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
   OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
-  OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
+  OldCost +=
+      TTI.getCmpSelInstrCost(CmpOpcode, I0->getType(),
+                             CmpInst::makeCmpResultType(I0->getType()), Pred) *
+      2;
   OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
 
   // The proposed vector pattern is:
@@ -750,7 +757,8 @@ bool VectorCombine::foldExtractedCmps(Instruction &I) {
   int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
   int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
   auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
-  InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
+  InstructionCost NewCost = TTI.getCmpSelInstrCost(
+      CmpOpcode, X->getType(), CmpInst::makeCmpResultType(X->getType()), Pred);
   SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
   ShufMask[CheapIndex] = ExpensiveIndex;
   NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,

diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
index bb2734284c86a..73e52c13a465f 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
@@ -1,15 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX
 
 define i1 @fcmp_and_v2f64(<2 x double> %a) {
-; CHECK-LABEL: @fcmp_and_v2f64(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
-; CHECK-NEXT:    [[R:%.*]] = and i1 [[CMP1]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[R]]
+; SSE-LABEL: @fcmp_and_v2f64(
+; SSE-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
+; SSE-NEXT:    [[E2:%.*]] = extractelement <2 x double> [[A]], i32 1
+; SSE-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
+; SSE-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
+; SSE-NEXT:    [[R:%.*]] = and i1 [[CMP1]], [[CMP2]]
+; SSE-NEXT:    ret i1 [[R]]
+;
+; AVX-LABEL: @fcmp_and_v2f64(
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <2 x double> [[A:%.*]], <double 4.200000e+01, double -8.000000e+00>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <2 x i32> <i32 1, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
+; AVX-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <2 x double> %a, i32 0
   %e2 = extractelement <2 x double> %a, i32 1
@@ -20,13 +27,20 @@ define i1 @fcmp_and_v2f64(<2 x double> %a) {
 }
 
 define i1 @fcmp_or_v4f64(<4 x double> %a) {
-; CHECK-LABEL: @fcmp_or_v4f64(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x double> [[A]], i64 2
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
-; CHECK-NEXT:    [[R:%.*]] = or i1 [[CMP1]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[R]]
+; SSE-LABEL: @fcmp_or_v4f64(
+; SSE-NEXT:    [[E1:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
+; SSE-NEXT:    [[E2:%.*]] = extractelement <4 x double> [[A]], i64 2
+; SSE-NEXT:    [[CMP1:%.*]] = fcmp olt double [[E1]], 4.200000e+01
+; SSE-NEXT:    [[CMP2:%.*]] = fcmp olt double [[E2]], -8.000000e+00
+; SSE-NEXT:    [[R:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; SSE-NEXT:    ret i1 [[R]]
+;
+; AVX-LABEL: @fcmp_or_v4f64(
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <4 x double> [[A:%.*]], <double 4.200000e+01, double undef, double -8.000000e+00, double undef>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = or <4 x i1> [[TMP1]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
+; AVX-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <4 x double> %a, i32 0
   %e2 = extractelement <4 x double> %a, i64 2
@@ -38,11 +52,10 @@ define i1 @fcmp_or_v4f64(<4 x double> %a) {
 
 define i1 @icmp_xor_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: @icmp_xor_v4i32(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 3
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[A]], i32 1
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i32 [[E1]], 42
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp sgt i32 [[E2]], -8
-; CHECK-NEXT:    [[R:%.*]] = xor i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[A:%.*]], <i32 undef, i32 -8, i32 undef, i32 42>
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], [[SHIFT]]
+; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
 ; CHECK-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <4 x i32> %a, i32 3
@@ -56,13 +69,20 @@ define i1 @icmp_xor_v4i32(<4 x i32> %a) {
 ; add is not canonical (should be xor), but that is ok.
 
 define i1 @icmp_add_v8i32(<8 x i32> %a) {
-; CHECK-LABEL: @icmp_add_v8i32(
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 7
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <8 x i32> [[A]], i32 2
-; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[E1]], 42
-; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[E2]], -8
-; CHECK-NEXT:    [[R:%.*]] = add i1 [[CMP1]], [[CMP2]]
-; CHECK-NEXT:    ret i1 [[R]]
+; SSE-LABEL: @icmp_add_v8i32(
+; SSE-NEXT:    [[E1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 7
+; SSE-NEXT:    [[E2:%.*]] = extractelement <8 x i32> [[A]], i32 2
+; SSE-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[E1]], 42
+; SSE-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[E2]], -8
+; SSE-NEXT:    [[R:%.*]] = add i1 [[CMP1]], [[CMP2]]
+; SSE-NEXT:    ret i1 [[R]]
+;
+; AVX-LABEL: @icmp_add_v8i32(
+; AVX-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i32> [[A:%.*]], <i32 undef, i32 undef, i32 -8, i32 undef, i32 undef, i32 undef, i32 undef, i32 42>
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> poison, <8 x i32> <i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = add <8 x i1> [[TMP1]], [[SHIFT]]
+; AVX-NEXT:    [[R:%.*]] = extractelement <8 x i1> [[TMP2]], i64 2
+; AVX-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <8 x i32> %a, i32 7
   %e2 = extractelement <8 x i32> %a, i32 2

diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
index 5bc70aeabfbcf..a1a1f41055e38 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
@@ -1,30 +1,26 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
 
 define i1 @cmp_v4i32(<4 x float> %arg, <4 x float> %arg1) {
 ; CHECK-LABEL: @cmp_v4i32(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[T:%.*]] = bitcast <4 x float> [[ARG:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[T2:%.*]] = extractelement <4 x i32> [[T]], i32 0
 ; CHECK-NEXT:    [[T3:%.*]] = bitcast <4 x float> [[ARG1:%.*]] to <4 x i32>
-; CHECK-NEXT:    [[T4:%.*]] = extractelement <4 x i32> [[T3]], i32 0
-; CHECK-NEXT:    [[T5:%.*]] = icmp eq i32 [[T2]], [[T4]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
+; CHECK-NEXT:    [[T5:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
 ; CHECK-NEXT:    br i1 [[T5]], label [[BB6:%.*]], label [[BB18:%.*]]
 ; CHECK:       bb6:
-; CHECK-NEXT:    [[T7:%.*]] = extractelement <4 x i32> [[T]], i32 1
-; CHECK-NEXT:    [[T8:%.*]] = extractelement <4 x i32> [[T3]], i32 1
-; CHECK-NEXT:    [[T9:%.*]] = icmp eq i32 [[T7]], [[T8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
+; CHECK-NEXT:    [[T9:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
 ; CHECK-NEXT:    br i1 [[T9]], label [[BB10:%.*]], label [[BB18]]
 ; CHECK:       bb10:
-; CHECK-NEXT:    [[T11:%.*]] = extractelement <4 x i32> [[T]], i32 2
-; CHECK-NEXT:    [[T12:%.*]] = extractelement <4 x i32> [[T3]], i32 2
-; CHECK-NEXT:    [[T13:%.*]] = icmp eq i32 [[T11]], [[T12]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
+; CHECK-NEXT:    [[T13:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
 ; CHECK-NEXT:    br i1 [[T13]], label [[BB14:%.*]], label [[BB18]]
 ; CHECK:       bb14:
-; CHECK-NEXT:    [[T15:%.*]] = extractelement <4 x i32> [[T]], i32 3
-; CHECK-NEXT:    [[T16:%.*]] = extractelement <4 x i32> [[T3]], i32 3
-; CHECK-NEXT:    [[T17:%.*]] = icmp eq i32 [[T15]], [[T16]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <4 x i32> [[T]], [[T3]]
+; CHECK-NEXT:    [[T17:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
 ; CHECK-NEXT:    br label [[BB18]]
 ; CHECK:       bb18:
 ; CHECK-NEXT:    [[T19:%.*]] = phi i1 [ false, [[BB10]] ], [ false, [[BB6]] ], [ false, [[BB:%.*]] ], [ [[T17]], [[BB14]] ]
@@ -62,19 +58,32 @@ bb18:
 }
 
 define i32 @cmp_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) {
-; CHECK-LABEL: @cmp_v2f64(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[CMP1:%.*]] = fcmp oeq double [[X1]], [[Y1]]
-; CHECK-NEXT:    br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]]
-; CHECK:       t:
-; CHECK-NEXT:    [[Z1:%.*]] = extractelement <2 x double> [[Z:%.*]], i32 1
-; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ogt double [[Y1]], [[Z1]]
-; CHECK-NEXT:    [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99
-; CHECK-NEXT:    ret i32 [[E]]
-; CHECK:       f:
-; CHECK-NEXT:    ret i32 0
+; SSE-LABEL: @cmp_v2f64(
+; SSE-NEXT:  entry:
+; SSE-NEXT:    [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
+; SSE-NEXT:    [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
+; SSE-NEXT:    [[CMP1:%.*]] = fcmp oeq double [[X1]], [[Y1]]
+; SSE-NEXT:    br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]]
+; SSE:       t:
+; SSE-NEXT:    [[Z1:%.*]] = extractelement <2 x double> [[Z:%.*]], i32 1
+; SSE-NEXT:    [[CMP2:%.*]] = fcmp ogt double [[Y1]], [[Z1]]
+; SSE-NEXT:    [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99
+; SSE-NEXT:    ret i32 [[E]]
+; SSE:       f:
+; SSE-NEXT:    ret i32 0
+;
+; AVX-LABEL: @cmp_v2f64(
+; AVX-NEXT:  entry:
+; AVX-NEXT:    [[TMP0:%.*]] = fcmp oeq <2 x double> [[X:%.*]], [[Y:%.*]]
+; AVX-NEXT:    [[CMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
+; AVX-NEXT:    br i1 [[CMP1]], label [[T:%.*]], label [[F:%.*]]
+; AVX:       t:
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp ogt <2 x double> [[Y]], [[Z:%.*]]
+; AVX-NEXT:    [[CMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; AVX-NEXT:    [[E:%.*]] = select i1 [[CMP2]], i32 42, i32 99
+; AVX-NEXT:    ret i32 [[E]]
+; AVX:       f:
+; AVX-NEXT:    ret i32 0
 ;
 entry:
   %x1 = extractelement <2 x double> %x, i32 1
@@ -93,11 +102,17 @@ f:
 }
 
 define i1 @cmp01_v2f64(<2 x double> %x, <2 x double> %y) {
-; CHECK-LABEL: @cmp01_v2f64(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oge double [[X0]], [[Y1]]
-; CHECK-NEXT:    ret i1 [[CMP]]
+; SSE-LABEL: @cmp01_v2f64(
+; SSE-NEXT:    [[X0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
+; SSE-NEXT:    [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
+; SSE-NEXT:    [[CMP:%.*]] = fcmp oge double [[X0]], [[Y1]]
+; SSE-NEXT:    ret i1 [[CMP]]
+;
+; AVX-LABEL: @cmp01_v2f64(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp oge <2 x double> [[X:%.*]], [[SHIFT]]
+; AVX-NEXT:    [[CMP:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
+; AVX-NEXT:    ret i1 [[CMP]]
 ;
   %x0 = extractelement <2 x double> %x, i32 0
   %y1 = extractelement <2 x double> %y, i32 1
@@ -106,11 +121,17 @@ define i1 @cmp01_v2f64(<2 x double> %x, <2 x double> %y) {
 }
 
 define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) {
-; CHECK-LABEL: @cmp10_v2f64(
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[Y0:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ule double [[X1]], [[Y0]]
-; CHECK-NEXT:    ret i1 [[CMP]]
+; SSE-LABEL: @cmp10_v2f64(
+; SSE-NEXT:    [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
+; SSE-NEXT:    [[Y0:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
+; SSE-NEXT:    [[CMP:%.*]] = fcmp ule double [[X1]], [[Y0]]
+; SSE-NEXT:    ret i1 [[CMP]]
+;
+; AVX-LABEL: @cmp10_v2f64(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 undef>
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp ule <2 x double> [[SHIFT]], [[Y:%.*]]
+; AVX-NEXT:    [[CMP:%.*]] = extractelement <2 x i1> [[TMP1]], i64 0
+; AVX-NEXT:    ret i1 [[CMP]]
 ;
   %x1 = extractelement <2 x double> %x, i32 1
   %y0 = extractelement <2 x double> %y, i32 0
@@ -120,9 +141,9 @@ define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) {
 
 define i1 @cmp12_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @cmp12_v4i32(
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X1]], [[Y2]]
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> poison, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[SHIFT]]
+; CHECK-NEXT:    [[CMP:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %x1 = extractelement <4 x i32> %x, i32 1
@@ -132,12 +153,19 @@ define i1 @cmp12_v4i32(<4 x i32> %x, <4 x i32> %y) {
 }
 
 define <4 x i1> @ins_fcmp_ext_ext(<4 x float> %a, <4 x i1> %b) {
-; CHECK-LABEL: @ins_fcmp_ext_ext(
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 1
-; CHECK-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
-; CHECK-NEXT:    [[A21:%.*]] = fcmp ugt float [[A2]], [[A1]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2
-; CHECK-NEXT:    ret <4 x i1> [[R]]
+; SSE-LABEL: @ins_fcmp_ext_ext(
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 1
+; SSE-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; SSE-NEXT:    [[A21:%.*]] = fcmp ugt float [[A2]], [[A1]]
+; SSE-NEXT:    [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2
+; SSE-NEXT:    ret <4 x i1> [[R]]
+;
+; AVX-LABEL: @ins_fcmp_ext_ext(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 undef, i32 undef, i32 1, i32 undef>
+; AVX-NEXT:    [[TMP1:%.*]] = fcmp ugt <4 x float> [[A]], [[SHIFT]]
+; AVX-NEXT:    [[A21:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; AVX-NEXT:    [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2
+; AVX-NEXT:    ret <4 x i1> [[R]]
 ;
   %a1 = extractelement <4 x float> %a, i32 1
   %a2 = extractelement <4 x float> %a, i32 2

diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
index c82c42d8d3eae..a9214aa26e767 100644
--- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp-inseltpoison.ll
@@ -132,8 +132,7 @@ define <4 x i1> @ins2_ins2_f32_uses(float %x, float %y) {
 ; CHECK-NEXT:    call void @usef(<4 x float> [[I0]])
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> poison, float [[Y:%.*]], i32 2
 ; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp oeq float [[X]], [[Y]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> poison, i1 [[R_SCALAR]], i64 2
+; CHECK-NEXT:    [[R:%.*]] = fcmp oeq <4 x float> [[I0]], [[I1]]
 ; CHECK-NEXT:    ret <4 x i1> [[R]]
 ;
   %i0 = insertelement <4 x float> poison, float %x, i32 2

diff --git a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
index 3d32113771457..85f521e7d0d14 100644
--- a/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/scalarize-cmp.ll
@@ -132,8 +132,7 @@ define <4 x i1> @ins2_ins2_f32_uses(float %x, float %y) {
 ; CHECK-NEXT:    call void @usef(<4 x float> [[I0]])
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x float> undef, float [[Y:%.*]], i32 2
 ; CHECK-NEXT:    call void @usef(<4 x float> [[I1]])
-; CHECK-NEXT:    [[R_SCALAR:%.*]] = fcmp oeq float [[X]], [[Y]]
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i1> zeroinitializer, i1 [[R_SCALAR]], i64 2
+; CHECK-NEXT:    [[R:%.*]] = fcmp oeq <4 x float> [[I0]], [[I1]]
 ; CHECK-NEXT:    ret <4 x i1> [[R]]
 ;
   %i0 = insertelement <4 x float> undef, float %x, i32 2


        

