[llvm] r218300 - [x86] Teach the AVX1 path of the new vector shuffle lowering one more

Tue Sep 23 03:08:29 PDT 2014

Author: chandlerc
Date: Tue Sep 23 05:08:29 2014
New Revision: 218300

URL: http://llvm.org/viewvc/llvm-project?rev=218300&view=rev
Log:
[x86] Teach the AVX1 path of the new vector shuffle lowering one more
trick that I missed.

VPERMILPS has a non-immediate memory operand mode that allows it to do
asymetric shuffles in the two 128-bit lanes. Use this rather than two
shuffles and a blend.

However, it turns out the variable shuffle path to VPERMILPS (and
VPERMILPD, although that one offers no functional differenc from the
immediate operand other than variability) wasn't even plumbed through
codegen. Do such plumbing so that we can reasonably emit
a variable-masked VPERMILP instruction. Also plumb basic comment parsing
and printing through so that the tests are reasonable.

There are still a few tests which don't show the shuffle pattern. These
are tests with undef lanes. I'll teach the shuffle decoding and printing
to handle undef mask entries in a follow-up. I've looked at the masks
and they seem reasonable.

Modified:
    llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
    llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.h
    llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
    llvm/trunk/lib/Target/X86/X86InstrSSE.td
    llvm/trunk/lib/Target/X86/X86MCInstLower.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll

Modified: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp?rev=218300&r1=218299&r2=218300&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp (original)
+++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp Tue Sep 23 05:08:29 2014
@@ -287,4 +287,27 @@ void DecodeVPERMMask(unsigned Imm, Small
   }
 }
 
+void DecodeVPERMILPMask(const ConstantDataSequential *C,
+                        SmallVectorImpl<int> &ShuffleMask) {
+  Type *MaskTy = C->getType();
+  assert(MaskTy->isVectorTy() && "Expected a vector constant mask!");
+  assert(MaskTy->getVectorElementType()->isIntegerTy() &&
+         "Expected integer constant mask elements!");
+  int ElementBits = MaskTy->getScalarSizeInBits();
+  int NumElements = MaskTy->getVectorNumElements();
+  assert((NumElements == 2 || NumElements == 4 || NumElements == 8) &&
+         "Unexpected number of vector elements.");
+  assert((unsigned)NumElements == C->getNumElements() &&
+         "Constant mask has a different number of elements!");
+
+  ShuffleMask.reserve(NumElements);
+  for (int i = 0; i < NumElements; ++i) {
+    int Base = (i * ElementBits / 128) * (128 / ElementBits);
+    uint64_t Element = C->getElementAsInteger(i);
+    // Only the least significant 2 bits of the integer are used.
+    int Index = Base + (Element & 0x3);
+    ShuffleMask.push_back(Index);
+  }
+}
+
 } // llvm namespace

Modified: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h?rev=218300&r1=218299&r2=218300&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h (original)
+++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h Tue Sep 23 05:08:29 2014
@@ -84,6 +84,10 @@ void DecodeVPERM2X128Mask(MVT VT, unsign
 /// No VT provided since it only works on 256-bit, 4 element vectors.
 void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
 
+/// \brief Decode a VPERMILP variable mask from an IR-level vector constant.
+void DecodeVPERMILPMask(const ConstantDataSequential *C,
+                        SmallVectorImpl<int> &ShuffleMask);
+
 } // llvm namespace
 
 #endif

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=218300&r1=218299&r2=218300&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Sep 23 05:08:29 2014
@@ -9395,26 +9395,15 @@ static SDValue lowerV8F32VectorShuffle(S
   }
 
   // If we have a single input shuffle with different shuffle patterns in the
-  // two 128-bit lanes, just do two shuffles and blend them together. This will
-  // be faster than extracting the high 128-bit lane, shuffling it, and
-  // re-inserting it. Especially on newer processors where blending is *the*
-  // fastest operation.
+  // two 128-bit lanes use the variable mask to VPERMILPS.
   if (isSingleInputShuffleMask(Mask)) {
-    int LoMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
-    int HiMask[4] = {Mask[4], Mask[5], Mask[6], Mask[7]};
-    for (int &M : HiMask)
-      if (M >= 0)
-        M -= 4;
-    SDValue Lo = V1, Hi = V1;
-    if (!isNoopShuffleMask(LoMask))
-      Lo = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Lo,
-                       getV4X86ShuffleImm8ForMask(LoMask, DAG));
-    if (!isNoopShuffleMask(HiMask))
-      Hi = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Hi,
-                       getV4X86ShuffleImm8ForMask(HiMask, DAG));
-    unsigned BlendMask = 1 << 4 | 1 << 5 | 1 << 6 | 1 << 7;
-    return DAG.getNode(X86ISD::BLENDI, DL, MVT::v8f32, Lo, Hi,
-                       DAG.getConstant(BlendMask, MVT::i8));
+    SDValue VPermMask[8];
+    for (int i = 0; i < 8; ++i)
+      VPermMask[i] = Mask[i] < 0 ? DAG.getUNDEF(MVT::i32)
+                                 : DAG.getConstant(Mask[i], MVT::i32);
+    return DAG.getNode(
+        X86ISD::VPERMILPV, DL, MVT::v8f32, V1,
+        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i32, VPermMask));
   }
 
   // Shuffle the input elements into the desired positions in V1 and V2 and

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=218300&r1=218299&r2=218300&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Tue Sep 23 05:08:29 2014
@@ -343,6 +343,7 @@ namespace llvm {
       MOVSS,
       UNPCKL,
       UNPCKH,
+      VPERMILPV,
       VPERMILPI,
       VPERMV,
       VPERMV3,

Modified: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td?rev=218300&r1=218299&r2=218300&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td Tue Sep 23 05:08:29 2014
@@ -188,6 +188,8 @@ def SDTShuff2Op : SDTypeProfile<1, 2, [S
 def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                 SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
 
+def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                        SDTCisVec<2>]>;
 def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
                                  SDTCisSameAs<0,1>, SDTCisInt<2>]>;
 def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
@@ -232,6 +234,7 @@ def X86Packus : SDNode<"X86ISD::PACKUS",
 def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
 def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
 
+def X86VPermilpv  : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
 def X86VPermilpi  : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
 def X86VPermv     : SDNode<"X86ISD::VPERMV",    SDTShuff2Op>;
 def X86VPermi     : SDNode<"X86ISD::VPERMI",    SDTShuff2OpI>;

Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=218300&r1=218299&r2=218300&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Tue Sep 23 05:08:29 2014
@@ -8418,6 +8418,15 @@ let ExeDomain = SSEPackedDouble in {
 }
 
 let Predicates = [HasAVX] in {
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
+          (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
+          (VPERMILPSYrm VR256:$src1, addr:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
+          (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
+def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
+          (VPERMILPDYrm VR256:$src1, addr:$src2)>;
+
 def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
           (VPERMILPSYri VR256:$src1, imm:$imm)>;
 def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
@@ -8428,6 +8437,15 @@ def : Pat<(v8i32 (X86VPermilpi (bc_v8i32
 def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
           (VPERMILPDYmi addr:$src1, imm:$imm)>;
 
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
+          (VPERMILPSrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
+          (VPERMILPSrm VR128:$src1, addr:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
+          (VPERMILPDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
+          (VPERMILPDrm VR128:$src1, addr:$src2)>;
+
 def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
           (VPERMILPDri VR128:$src1, imm:$imm)>;
 def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),

Modified: llvm/trunk/lib/Target/X86/X86MCInstLower.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86MCInstLower.cpp?rev=218300&r1=218299&r2=218300&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86MCInstLower.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86MCInstLower.cpp Tue Sep 23 05:08:29 2014
@@ -1022,15 +1022,19 @@ void X86AsmPrinter::EmitInstruction(cons
 
   case X86::PSHUFBrm:
   case X86::VPSHUFBrm:
-    // Lower PSHUFB normally but add a comment if we can find a constant
-    // shuffle mask. We won't be able to do this at the MC layer because the
-    // mask isn't an immediate.
+  case X86::VPERMILPSrm:
+  case X86::VPERMILPDrm:
+  case X86::VPERMILPSYrm:
+  case X86::VPERMILPDYrm:
+    // Lower PSHUFB and VPERMILP normally but add a comment if we can find
+    // a constant shuffle mask. We won't be able to do this at the MC layer
+    // because the mask isn't an immediate.
     std::string Comment;
     raw_string_ostream CS(Comment);
     SmallVector<int, 16> Mask;
 
-    assert(MI->getNumOperands() >= 6 &&
-           "Wrong number of operands for PSHUFBrm or VPSHUFBrm");
+    // All of these instructions accept a constant pool operand as their fifth.
+    assert(MI->getNumOperands() > 5 && "We should always have at least 5 operands!");
     const MachineOperand &DstOp = MI->getOperand(0);
     const MachineOperand &SrcOp = MI->getOperand(1);
     const MachineOperand &MaskOp = MI->getOperand(5);
@@ -1061,7 +1065,18 @@ void X86AsmPrinter::EmitInstruction(cons
           assert(MaskTy == C->getType() &&
                  "Expected a constant of the same type!");
 
-          DecodePSHUFBMask(C, Mask);
+          switch (MI->getOpcode()) {
+          case X86::PSHUFBrm:
+          case X86::VPSHUFBrm:
+            DecodePSHUFBMask(C, Mask);
+            break;
+          case X86::VPERMILPSrm:
+          case X86::VPERMILPDrm:
+          case X86::VPERMILPSYrm:
+          case X86::VPERMILPDYrm:
+            DecodeVPERMILPMask(C, Mask);
+          }
+
           assert(Mask.size() == MaskTy->getVectorNumElements() &&
                  "Shuffle mask has a different size than its type!");
         }

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll?rev=218300&r1=218299&r2=218300&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll Tue Sep 23 05:08:29 2014
@@ -381,9 +381,7 @@ define <8 x float> @shuffle_v8f32_102254
 define <8 x float> @shuffle_v8f32_00015444(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_00015444
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,0,0,5,4,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,4,4,4,5]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,5,4,4,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
   ret <8 x float> %shuffle
@@ -392,9 +390,7 @@ define <8 x float> @shuffle_v8f32_000154
 define <8 x float> @shuffle_v8f32_00204644(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_00204644
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[0,2,0,0,4,6,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,4,6,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,6,4,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
   ret <8 x float> %shuffle
@@ -403,9 +399,7 @@ define <8 x float> @shuffle_v8f32_002046
 define <8 x float> @shuffle_v8f32_03004474(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_03004474
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[0,0,3,0,4,4,7,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,7,4,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,4,7,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
   ret <8 x float> %shuffle
@@ -414,9 +408,7 @@ define <8 x float> @shuffle_v8f32_030044
 define <8 x float> @shuffle_v8f32_10004444(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_10004444
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[0,0,0,0,4,4,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,5,4,4,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,4,4,4,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x float> %shuffle
@@ -425,9 +417,7 @@ define <8 x float> @shuffle_v8f32_100044
 define <8 x float> @shuffle_v8f32_22006446(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_22006446
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[2,0,0,2,6,4,4,6]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,6,4,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,4,4,6]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
   ret <8 x float> %shuffle
@@ -436,9 +426,7 @@ define <8 x float> @shuffle_v8f32_220064
 define <8 x float> @shuffle_v8f32_33307474(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_33307474
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[3,0,3,0,7,4,7,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,7,7,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,4,7,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
   ret <8 x float> %shuffle
@@ -447,8 +435,7 @@ define <8 x float> @shuffle_v8f32_333074
 define <8 x float> @shuffle_v8f32_32104567(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_32104567
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[3,2,1,0,4,5,6,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %shuffle
@@ -457,9 +444,7 @@ define <8 x float> @shuffle_v8f32_321045
 define <8 x float> @shuffle_v8f32_00236744(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_00236744
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[2,3,0,0,6,7,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,6,7,4,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
   ret <8 x float> %shuffle
@@ -468,9 +453,7 @@ define <8 x float> @shuffle_v8f32_002367
 define <8 x float> @shuffle_v8f32_00226644(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_00226644
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[2,2,0,0,6,6,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,6,6,4,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
   ret <8 x float> %shuffle
@@ -479,8 +462,7 @@ define <8 x float> @shuffle_v8f32_002266
 define <8 x float> @shuffle_v8f32_10324567(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_10324567
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[1,0,3,2,4,5,6,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %shuffle
@@ -489,8 +471,7 @@ define <8 x float> @shuffle_v8f32_103245
 define <8 x float> @shuffle_v8f32_11334567(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_11334567
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[1,1,3,3,4,5,6,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x float> %shuffle
@@ -499,8 +480,7 @@ define <8 x float> @shuffle_v8f32_113345
 define <8 x float> @shuffle_v8f32_01235467(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_01235467
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
   ret <8 x float> %shuffle
@@ -509,8 +489,7 @@ define <8 x float> @shuffle_v8f32_012354
 define <8 x float> @shuffle_v8f32_01235466(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_01235466
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,2,2,5,4,6,6]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,6]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
   ret <8 x float> %shuffle
@@ -519,9 +498,7 @@ define <8 x float> @shuffle_v8f32_012354
 define <8 x float> @shuffle_v8f32_002u6u44(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_002u6u44
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[2,1,0,0,6,5,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
   ret <8 x float> %shuffle
@@ -530,9 +507,7 @@ define <8 x float> @shuffle_v8f32_002u6u
 define <8 x float> @shuffle_v8f32_00uu66uu(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_00uu66uu
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[2,2,2,3,6,6,6,7]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
   ret <8 x float> %shuffle
@@ -541,8 +516,7 @@ define <8 x float> @shuffle_v8f32_00uu66
 define <8 x float> @shuffle_v8f32_103245uu(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_103245uu
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
   ret <8 x float> %shuffle
@@ -551,8 +525,7 @@ define <8 x float> @shuffle_v8f32_103245
 define <8 x float> @shuffle_v8f32_1133uu67(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_1133uu67
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
   ret <8 x float> %shuffle
@@ -561,8 +534,7 @@ define <8 x float> @shuffle_v8f32_1133uu
 define <8 x float> @shuffle_v8f32_0uu354uu(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_0uu354uu
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
   ret <8 x float> %shuffle
@@ -571,8 +543,7 @@ define <8 x float> @shuffle_v8f32_0uu354
 define <8 x float> @shuffle_v8f32_uuu3uu66(<8 x float> %a, <8 x float> %b) {
 ; ALL-LABEL: @shuffle_v8f32_uuu3uu66
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[0,1,2,2,4,5,6,6]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
   ret <8 x float> %shuffle
@@ -956,9 +927,7 @@ define <8 x i32> @shuffle_v8i32_10225466
 define <8 x i32> @shuffle_v8i32_00015444(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_00015444
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,0,0,5,4,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,4,4,4,5]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,0,1,5,4,4,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
   ret <8 x i32> %shuffle
@@ -967,9 +936,7 @@ define <8 x i32> @shuffle_v8i32_00015444
 define <8 x i32> @shuffle_v8i32_00204644(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_00204644
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[0,2,0,0,4,6,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,4,6,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,0,4,6,4,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
   ret <8 x i32> %shuffle
@@ -978,9 +945,7 @@ define <8 x i32> @shuffle_v8i32_00204644
 define <8 x i32> @shuffle_v8i32_03004474(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_03004474
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[0,0,3,0,4,4,7,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,7,4,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,3,0,0,4,4,7,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
   ret <8 x i32> %shuffle
@@ -989,9 +954,7 @@ define <8 x i32> @shuffle_v8i32_03004474
 define <8 x i32> @shuffle_v8i32_10004444(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_10004444
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[0,0,0,0,4,4,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,5,4,4,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[1,0,0,0,4,4,4,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   ret <8 x i32> %shuffle
@@ -1000,9 +963,7 @@ define <8 x i32> @shuffle_v8i32_10004444
 define <8 x i32> @shuffle_v8i32_22006446(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_22006446
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[2,0,0,2,6,4,4,6]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,6,4,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[2,2,0,0,6,4,4,6]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
   ret <8 x i32> %shuffle
@@ -1011,9 +972,7 @@ define <8 x i32> @shuffle_v8i32_22006446
 define <8 x i32> @shuffle_v8i32_33307474(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_33307474
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[3,0,3,0,7,4,7,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,7,7,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[3,3,3,0,7,4,7,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
   ret <8 x i32> %shuffle
@@ -1022,8 +981,7 @@ define <8 x i32> @shuffle_v8i32_33307474
 define <8 x i32> @shuffle_v8i32_32104567(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_32104567
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[3,2,1,0,4,5,6,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i32> %shuffle
@@ -1032,9 +990,7 @@ define <8 x i32> @shuffle_v8i32_32104567
 define <8 x i32> @shuffle_v8i32_00236744(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_00236744
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[2,3,0,0,6,7,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,6,7,4,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
   ret <8 x i32> %shuffle
@@ -1043,9 +999,7 @@ define <8 x i32> @shuffle_v8i32_00236744
 define <8 x i32> @shuffle_v8i32_00226644(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_00226644
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[2,2,0,0,6,6,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,2,6,6,4,4]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
   ret <8 x i32> %shuffle
@@ -1054,8 +1008,7 @@ define <8 x i32> @shuffle_v8i32_00226644
 define <8 x i32> @shuffle_v8i32_10324567(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_10324567
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[1,0,3,2,4,5,6,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i32> %shuffle
@@ -1064,8 +1017,7 @@ define <8 x i32> @shuffle_v8i32_10324567
 define <8 x i32> @shuffle_v8i32_11334567(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_11334567
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[1,1,3,3,4,5,6,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i32> %shuffle
@@ -1074,8 +1026,7 @@ define <8 x i32> @shuffle_v8i32_11334567
 define <8 x i32> @shuffle_v8i32_01235467(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_01235467
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,7]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
   ret <8 x i32> %shuffle
@@ -1084,8 +1035,7 @@ define <8 x i32> @shuffle_v8i32_01235467
 define <8 x i32> @shuffle_v8i32_01235466(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_01235466
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,2,2,5,4,6,6]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,1,2,3,5,4,6,6]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
   ret <8 x i32> %shuffle
@@ -1094,9 +1044,7 @@ define <8 x i32> @shuffle_v8i32_01235466
 define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_002u6u44
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[2,1,0,0,6,5,4,4]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
   ret <8 x i32> %shuffle
@@ -1105,9 +1053,7 @@ define <8 x i32> @shuffle_v8i32_002u6u44
 define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_00uu66uu
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[2,2,2,3,6,6,6,7]
-; ALL-NEXT:    vpermilps {{.*}} # ymm0 = ymm0[0,0,2,3,4,4,6,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
   ret <8 x i32> %shuffle
@@ -1116,8 +1062,7 @@ define <8 x i32> @shuffle_v8i32_00uu66uu
 define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_103245uu
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,3,2,5,4,7,6]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
   ret <8 x i32> %shuffle
@@ -1126,8 +1071,7 @@ define <8 x i32> @shuffle_v8i32_103245uu
 define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_1133uu67
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,1,3,3,5,5,7,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
   ret <8 x i32> %shuffle
@@ -1136,8 +1080,7 @@ define <8 x i32> @shuffle_v8i32_1133uu67
 define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_0uu354uu
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[1,0,2,3,5,4,6,7]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
   ret <8 x i32> %shuffle
@@ -1146,8 +1089,7 @@ define <8 x i32> @shuffle_v8i32_0uu354uu
 define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) {
 ; ALL-LABEL: @shuffle_v8i32_uuu3uu66
 ; ALL:       # BB#0:
-; ALL-NEXT:    vpermilps {{.*}} # ymm1 = ymm0[0,1,2,2,4,5,6,6]
-; ALL-NEXT:    vblendps {{.*}} # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT:    vpermilps {{.*}}, %ymm0, %ymm0
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
   ret <8 x i32> %shuffle