[llvm] a69158c - [VectorCombine] fold extract-extract-op with different extraction indexes

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 8 07:00:36 PDT 2020


Author: Sanjay Patel
Date: 2020-03-08T09:57:55-04:00
New Revision: a69158c12acd635ee4bcb22b1060d92b29483042

URL: https://github.com/llvm/llvm-project/commit/a69158c12acd635ee4bcb22b1060d92b29483042
DIFF: https://github.com/llvm/llvm-project/commit/a69158c12acd635ee4bcb22b1060d92b29483042.diff

LOG: [VectorCombine] fold extract-extract-op with different extraction indexes

opcode (extelt V0, Ext0), (extelt V1, Ext1) --> extelt (opcode (splat V0, Ext0), V1), Ext1

The first part of this patch generalizes the cost calculation to accept
different extraction indexes. The second part creates a shuffle+extract
before feeding into the existing code that creates a vector op+extract.
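
For illustration, here is a hand-written sketch of the generalized fold
(modeled on the ext1_ext0_fmul test updated below; value names are
hypothetical). Starting from:

  %e0 = extractelement <4 x float> %x, i32 1
  %e1 = extractelement <4 x float> %x, i32 0
  %r  = fmul float %e0, %e1

the more expensive lane is splatted with a shuffle, the op is performed in
vector form, and a single extract remains:

  %s = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %v = fmul <4 x float> %s, %x
  %r = extractelement <4 x float> %v, i64 0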

The patch conservatively uses "TargetTransformInfo::SK_PermuteSingleSrc"
rather than "TargetTransformInfo::SK_Broadcast" (splat specifically
from element 0) because we do not have a more general "SK_Splat"
currently. That does not affect any of the current regression tests,
but we might be able to find some cost model target specialization where
that comes into play.
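
In mask form, the difference looks like this (a sketch, not code from the
patch; %v is a hypothetical input):

  ; SK_Broadcast models only a splat of element 0:
  %b = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
  ; a splat of element 2 - no dedicated "SK_Splat" kind covers this,
  ; so it is conservatively priced as SK_PermuteSingleSrc:
  %s = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>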

I suspect that we can expose some missing x86 horizontal op codegen with
this transform, so I'm speculatively adding a debug flag to disable the
binop variant to allow easier testing.
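
For example, the flag can be exercised with a RUN line in the style of the
existing tests (a sketch; the target attributes shown are illustrative):

  ; RUN: opt < %s -vector-combine -disable-binop-extract-shuffle -S \
  ; RUN:   -mtriple=x86_64-- -mattr=avx | FileCheck %s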

The test changes show that we're sensitive to cost model diffs (as we
should be), so cost model patches like D74976 should now have better
test coverage.

Differential Revision: https://reviews.llvm.org/D75689

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VectorCombine.cpp
    llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
    llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index ed7415d45c31..ba3ff87884b0 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -38,19 +38,23 @@ static cl::opt<bool> DisableVectorCombine(
     "disable-vector-combine", cl::init(false), cl::Hidden,
     cl::desc("Disable all vector combine transforms"));
 
-/// Compare the relative costs of extracts followed by scalar operation vs.
-/// vector operation followed by extract:
-/// opcode (extelt V0, C), (extelt V1, C) --> extelt (opcode V0, V1), C
-/// Unless the vector op is much more expensive than the scalar op, this
-/// eliminates an extract.
+static cl::opt<bool> DisableBinopExtractShuffle(
+    "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
+    cl::desc("Disable binop extract to shuffle transforms"));
+
+
+/// Compare the relative costs of 2 extracts followed by scalar operation vs.
+/// vector operation(s) followed by extract. Return true if the existing
+/// instructions are cheaper than a vector alternative. Otherwise, return false
+/// and if one of the extracts should be transformed to a shufflevector, set
+/// \p ConvertToShuffle to that extract instruction.
 static bool isExtractExtractCheap(Instruction *Ext0, Instruction *Ext1,
                                   unsigned Opcode,
-                                  const TargetTransformInfo &TTI) {
+                                  const TargetTransformInfo &TTI,
+                                  Instruction *&ConvertToShuffle) {
   assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
-         (cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue() ==
-          cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue()) &&
-         "Expected same constant extract index");
-
+         isa<ConstantInt>(Ext1->getOperand(1)) &&
+         "Expected constant extract indexes");
   Type *ScalarTy = Ext0->getType();
   Type *VecTy = Ext0->getOperand(0)->getType();
   int ScalarOpCost, VectorOpCost;
@@ -69,31 +73,73 @@ static bool isExtractExtractCheap(Instruction *Ext0, Instruction *Ext1,
                                           CmpInst::makeCmpResultType(VecTy));
   }
 
-  // Get cost estimate for the extract element. This cost will factor into
+  // Get cost estimates for the extract elements. These costs will factor into
   // both sequences.
-  unsigned ExtIndex = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
-  int ExtractCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
-                                           VecTy, ExtIndex);
+  unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
+  unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+
+  int Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                            VecTy, Ext0Index);
+  int Extract1Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                            VecTy, Ext1Index);
+
+  // A more expensive extract will always be replaced by a splat shuffle.
+  // For example, if Ext0 is more expensive:
+  // opcode (extelt V0, Ext0), (extelt V1, Ext1) -->
+  // extelt (opcode (splat V0, Ext0), V1), Ext1
+  // TODO: Evaluate whether that always results in lowest cost. Alternatively,
+  //       check the cost of creating a broadcast shuffle and shuffling both
+  //       operands to element 0.
+  int CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
 
   // Extra uses of the extracts mean that we include those costs in the
   // vector total because those instructions will not be eliminated.
   int OldCost, NewCost;
-  if (Ext0->getOperand(0) == Ext1->getOperand(0)) {
-    // Handle a special case. If the 2 operands are identical, adjust the
+  if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
+    // Handle a special case. If the 2 extracts are identical, adjust the
     // formulas to account for that. The extra use charge allows for either the
     // CSE'd pattern or an unoptimized form with identical values:
     // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
     bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
                                   : !Ext0->hasOneUse() || !Ext1->hasOneUse();
-    OldCost = ExtractCost + ScalarOpCost;
-    NewCost = VectorOpCost + ExtractCost + HasUseTax * ExtractCost;
+    OldCost = CheapExtractCost + ScalarOpCost;
+    NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
   } else {
    // Handle the general case. Each extract is actually a different value:
-    // opcode (extelt V0, C), (extelt V1, C) --> extelt (opcode V0, V1), C
-    OldCost = 2 * ExtractCost + ScalarOpCost;
-    NewCost = VectorOpCost + ExtractCost + !Ext0->hasOneUse() * ExtractCost +
-              !Ext1->hasOneUse() * ExtractCost;
+    // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
+    OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
+    NewCost = VectorOpCost + CheapExtractCost +
+              !Ext0->hasOneUse() * Extract0Cost +
+              !Ext1->hasOneUse() * Extract1Cost;
   }
+
+  if (Ext0Index == Ext1Index) {
+    // If the extract indexes are identical, no shuffle is needed.
+    ConvertToShuffle = nullptr;
+  } else {
+    if (IsBinOp && DisableBinopExtractShuffle)
+      return true;
+
+    // If we are extracting from 2 different indexes, then one operand must be
+    // shuffled before performing the vector operation. The shuffle mask is
+    // undefined except for 1 lane that is being translated to the remaining
+    // extraction lane. Therefore, it is a splat shuffle. Ex:
+    // ShufMask = { undef, undef, 0, undef }
+    // TODO: The cost model has an option for a "broadcast" shuffle
+    //       (splat-from-element-0), but no option for a more general splat.
+    NewCost +=
+        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+
+    // The more expensive extract will be replaced by a shuffle. If the extracts
+    // have the same cost, replace the extract with the higher index.
+    if (Extract0Cost > Extract1Cost)
+      ConvertToShuffle = Ext0;
+    else if (Extract1Cost > Extract0Cost)
+      ConvertToShuffle = Ext1;
+    else
+      ConvertToShuffle = Ext0Index > Ext1Index ? Ext0 : Ext1;
+  }
+
   // Aggressively form a vector op if the cost is equal because the transform
   // may enable further optimization.
   // Codegen can reverse this transform (scalarize) if it was not profitable.
@@ -162,12 +208,33 @@ static bool foldExtractExtract(Instruction &I, const TargetTransformInfo &TTI) {
       V0->getType() != V1->getType())
     return false;
 
-  // TODO: Handle C0 != C1 by shuffling 1 of the operands.
-  if (C0 != C1)
+  Instruction *ConvertToShuffle;
+  if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), TTI, ConvertToShuffle))
     return false;
 
-  if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), TTI))
-    return false;
+  if (ConvertToShuffle) {
+    // The shuffle mask is undefined except for 1 lane that is being translated
+    // to the cheap extraction lane. Example:
+    // ShufMask = { 2, undef, undef, undef }
+    uint64_t SplatIndex = ConvertToShuffle == Ext0 ? C0 : C1;
+    uint64_t CheapExtIndex = ConvertToShuffle == Ext0 ? C1 : C0;
+    Type *VecTy = V0->getType();
+    Type *I32Ty = IntegerType::getInt32Ty(I.getContext());
+    UndefValue *Undef = UndefValue::get(I32Ty);
+    SmallVector<Constant *, 32> ShufMask(VecTy->getVectorNumElements(), Undef);
+    ShufMask[CheapExtIndex] = ConstantInt::get(I32Ty, SplatIndex);
+    IRBuilder<> Builder(ConvertToShuffle);
+
+    // extelt X, C --> extelt (splat X), C'
+    Value *Shuf = Builder.CreateShuffleVector(ConvertToShuffle->getOperand(0),
+                                              UndefValue::get(VecTy),
+                                              ConstantVector::get(ShufMask));
+    Value *NewExt = Builder.CreateExtractElement(Shuf, CheapExtIndex);
+    if (ConvertToShuffle == Ext0)
+      Ext0 = cast<Instruction>(NewExt);
+    else
+      Ext1 = cast<Instruction>(NewExt);
+  }
 
   if (Pred != CmpInst::BAD_ICMP_PREDICATE)
     foldExtExtCmp(Ext0, Ext1, I, TTI);

diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index 7edc636acbcb..56635bdff8c5 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -251,14 +251,18 @@ define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
   ret i8 %r
 }
 
-; TODO: Different extract indexes requires a shuffle.
-
 define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-LABEL: @ext0_ext1_add(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = add nuw i8 [[E0]], [[E1]]
-; CHECK-NEXT:    ret i8 [[R]]
+; SSE-LABEL: @ext0_ext1_add(
+; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
+; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
+; SSE-NEXT:    [[R:%.*]] = add nuw i8 [[E0]], [[E1]]
+; SSE-NEXT:    ret i8 [[R]]
+;
+; AVX-LABEL: @ext0_ext1_add(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = add nuw <16 x i8> [[X:%.*]], [[TMP1]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <16 x i8> [[TMP2]], i32 0
+; AVX-NEXT:    ret i8 [[TMP3]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 0
   %e1 = extractelement <16 x i8> %y, i32 1
@@ -267,11 +271,17 @@ define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
 }
 
 define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-LABEL: @ext5_ext0_add(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = sub nsw i8 [[E0]], [[E1]]
-; CHECK-NEXT:    ret i8 [[R]]
+; SSE-LABEL: @ext5_ext0_add(
+; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
+; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
+; SSE-NEXT:    [[R:%.*]] = sub nsw i8 [[E0]], [[E1]]
+; SSE-NEXT:    ret i8 [[R]]
+;
+; AVX-LABEL: @ext5_ext0_add(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> undef, <16 x i32> <i32 5, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = sub nsw <16 x i8> [[TMP1]], [[Y:%.*]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <16 x i8> [[TMP2]], i64 0
+; AVX-NEXT:    ret i8 [[TMP3]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 5
   %e1 = extractelement <16 x i8> %y, i32 0
@@ -280,11 +290,17 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) {
 }
 
 define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) {
-; CHECK-LABEL: @ext1_ext6_add(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[E0]], [[E1]]
-; CHECK-NEXT:    ret i8 [[R]]
+; SSE-LABEL: @ext1_ext6_add(
+; SSE-NEXT:    [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
+; SSE-NEXT:    [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6
+; SSE-NEXT:    [[R:%.*]] = and i8 [[E0]], [[E1]]
+; SSE-NEXT:    ret i8 [[R]]
+;
+; AVX-LABEL: @ext1_ext6_add(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[Y:%.*]], <16 x i8> undef, <16 x i32> <i32 undef, i32 6, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = and <16 x i8> [[X:%.*]], [[TMP1]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <16 x i8> [[TMP2]], i32 1
+; AVX-NEXT:    ret i8 [[TMP3]]
 ;
   %e0 = extractelement <16 x i8> %x, i32 1
   %e1 = extractelement <16 x i8> %y, i32 6
@@ -294,10 +310,10 @@ define i8 @ext1_ext6_add(<16 x i8> %x, <16 x i8> %y) {
 
 define float @ext1_ext0_fmul(<4 x float> %x) {
 ; CHECK-LABEL: @ext1_ext0_fmul(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x float> [[X]], i32 0
-; CHECK-NEXT:    [[R:%.*]] = fmul float [[E0]], [[E1]]
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x float> [[TMP1]], [[X]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i64 0
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %e0 = extractelement <4 x float> %x, i32 1
   %e1 = extractelement <4 x float> %x, i32 0
@@ -309,9 +325,10 @@ define float @ext0_ext3_fmul_extra_use1(<4 x float> %x) {
 ; CHECK-LABEL: @ext0_ext3_fmul_extra_use1(
 ; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
 ; CHECK-NEXT:    call void @use_f32(float [[E0]])
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x float> [[X]], i32 3
-; CHECK-NEXT:    [[R:%.*]] = fmul nnan float [[E0]], [[E1]]
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul nnan <4 x float> [[X]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %e0 = extractelement <4 x float> %x, i32 0
   call void @use_f32(float %e0)
@@ -336,11 +353,17 @@ define float @ext0_ext3_fmul_extra_use2(<4 x float> %x) {
 }
 
 define float @ext0_ext4_fmul_v8f32(<8 x float> %x) {
-; CHECK-LABEL: @ext0_ext4_fmul_v8f32(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <8 x float> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <8 x float> [[X]], i32 4
-; CHECK-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
-; CHECK-NEXT:    ret float [[R]]
+; SSE-LABEL: @ext0_ext4_fmul_v8f32(
+; SSE-NEXT:    [[E0:%.*]] = extractelement <8 x float> [[X:%.*]], i32 0
+; SSE-NEXT:    [[E1:%.*]] = extractelement <8 x float> [[X]], i32 4
+; SSE-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
+; SSE-NEXT:    ret float [[R]]
+;
+; AVX-LABEL: @ext0_ext4_fmul_v8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[X:%.*]], <8 x float> undef, <8 x i32> <i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[X]], [[TMP1]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 0
+; AVX-NEXT:    ret float [[TMP3]]
 ;
   %e0 = extractelement <8 x float> %x, i32 0
   %e1 = extractelement <8 x float> %x, i32 4
@@ -349,11 +372,17 @@ define float @ext0_ext4_fmul_v8f32(<8 x float> %x) {
 }
 
 define float @ext7_ext4_fmul_v8f32(<8 x float> %x) {
-; CHECK-LABEL: @ext7_ext4_fmul_v8f32(
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <8 x float> [[X]], i32 4
-; CHECK-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
-; CHECK-NEXT:    ret float [[R]]
+; SSE-LABEL: @ext7_ext4_fmul_v8f32(
+; SSE-NEXT:    [[E0:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; SSE-NEXT:    [[E1:%.*]] = extractelement <8 x float> [[X]], i32 4
+; SSE-NEXT:    [[R:%.*]] = fadd float [[E0]], [[E1]]
+; SSE-NEXT:    ret float [[R]]
+;
+; AVX-LABEL: @ext7_ext4_fmul_v8f32(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[X:%.*]], <8 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef, i32 undef, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[TMP1]], [[X]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i64 4
+; AVX-NEXT:    ret float [[TMP3]]
 ;
   %e0 = extractelement <8 x float> %x, i32 7
   %e1 = extractelement <8 x float> %x, i32 4

diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
index bfbd79a72be4..65d886c7bca9 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
@@ -102,11 +102,17 @@ f:
 }
 
 define i1 @cmp01_v2f64(<2 x double> %x, <2 x double> %y) {
-; CHECK-LABEL: @cmp01_v2f64(
-; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
-; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp oge double [[X0]], [[Y1]]
-; CHECK-NEXT:    ret i1 [[CMP]]
+; SSE-LABEL: @cmp01_v2f64(
+; SSE-NEXT:    [[X0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
+; SSE-NEXT:    [[Y1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
+; SSE-NEXT:    [[CMP:%.*]] = fcmp oge double [[X0]], [[Y1]]
+; SSE-NEXT:    ret i1 [[CMP]]
+;
+; AVX-LABEL: @cmp01_v2f64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = fcmp oge <2 x double> [[X:%.*]], [[TMP1]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; AVX-NEXT:    ret i1 [[TMP3]]
 ;
   %x0 = extractelement <2 x double> %x, i32 0
   %y1 = extractelement <2 x double> %y, i32 1
@@ -115,11 +121,17 @@ define i1 @cmp01_v2f64(<2 x double> %x, <2 x double> %y) {
 }
 
 define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) {
-; CHECK-LABEL: @cmp10_v2f64(
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[Y0:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[CMP:%.*]] = fcmp ule double [[X1]], [[Y0]]
-; CHECK-NEXT:    ret i1 [[CMP]]
+; SSE-LABEL: @cmp10_v2f64(
+; SSE-NEXT:    [[X1:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
+; SSE-NEXT:    [[Y0:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 0
+; SSE-NEXT:    [[CMP:%.*]] = fcmp ule double [[X1]], [[Y0]]
+; SSE-NEXT:    ret i1 [[CMP]]
+;
+; AVX-LABEL: @cmp10_v2f64(
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; AVX-NEXT:    [[TMP2:%.*]] = fcmp ule <2 x double> [[TMP1]], [[Y:%.*]]
+; AVX-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
+; AVX-NEXT:    ret i1 [[TMP3]]
 ;
   %x1 = extractelement <2 x double> %x, i32 1
   %y0 = extractelement <2 x double> %y, i32 0
@@ -129,10 +141,10 @@ define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) {
 
 define i1 @cmp12_v4i32(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: @cmp12_v4i32(
-; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
-; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i32 2
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X1]], [[Y2]]
-; CHECK-NEXT:    ret i1 [[CMP]]
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
+; CHECK-NEXT:    ret i1 [[TMP3]]
 ;
   %x1 = extractelement <4 x i32> %x, i32 1
   %y2 = extractelement <4 x i32> %y, i32 2

