[llvm] r217755 - [x86] Undo a flawed transform I added to form UNPCK instructions when

Chandler Carruth chandlerc at gmail.com
Mon Sep 15 03:35:42 PDT 2014


Author: chandlerc
Date: Mon Sep 15 05:35:41 2014
New Revision: 217755

URL: http://llvm.org/viewvc/llvm-project?rev=217755&view=rev
Log:
[x86] Undo a flawed transform I added to form UNPCK instructions when
AVX is available, and generally tidy up things surrounding UNPCK
formation.

Originally, I was thinking that the only advantage of PSHUFD over the
UNPCK instruction variants was its free copy, and that otherwise we
should use the shorter-encoded UNPCK instructions. This isn't right,
though: there is a larger advantage in being able to fold a load into
the operand of a PSHUFD. For UNPCK, the operand *must* be in a register
so that it can be the second input.
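
To make the tradeoff concrete, here is a sketch in AT&T syntax (the
register and address choices are illustrative only):

  # A splat-style v4i32 shuffle <0,0,1,1> of a value sitting in memory:
  pshufd $0x50, (%rdi), %xmm0   # the load folds straight into the shuffle
                                # (16-byte aligned for the legacy encoding)

  # The UNPCK form needs the value in a register first, since the same
  # value must feed both inputs:
  movdqa (%rdi), %xmm0
  punpckldq %xmm0, %xmm0        # xmm0 = xmm0[0,0,1,1]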

This removes the UNPCK formation in the target-specific DAG combine for
v4i32 shuffles. It also lifts the v8 and v16 cases out of the
AVX-specific check, as they potentially replace multiple instructions
with a single instruction and so should always be valuable. The
floating point checks are simplified accordingly.
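
As an example of why the v8 and v16 cases are always valuable, consider
the v16i8 low-half splat: there is no single-instruction PSHUF*
equivalent at byte granularity (PSHUFB only arrives with SSSE3 and
needs a mask register), so the UNPCK form is a clear win:

  # v16i8 mask <0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7>
  punpcklbw %xmm0, %xmm0        # a single instruction for the whole mask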

This also adjusts the formation of PSHUFD instructions to attempt to
match the shuffle mask to one which would fit an UNPCK instruction
variant. This was originally motivated by letting the combiner match
them as UNPCK instructions, but clearly that won't happen now.
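
Concretely, for the v4i32 case the canonicalized masks correspond to
the following immediates (a sketch; the equivalences hold when both
UNPCK inputs are the same register):

  # mask <0,0,1,1>  ->  pshufd $0x50, %xmm0, %xmm0  # == punpckldq %xmm0, %xmm0
  # mask <2,2,3,3>  ->  pshufd $0xfa, %xmm0, %xmm0  # == punpckhdq %xmm0, %xmm0

Note that $0xfa prints as $-6 in the updated avx-basic.ll test below.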

Eventually, we should add a MachineCombiner pass that can form UNPCK
instructions post-RA when the operand is known to be in a register and
thus there is no loss.
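
Such a pass could, for instance, rewrite the in-place form into the
shorter encoding (a hypothetical peephole, not part of this commit;
byte counts are for the legacy SSE encodings):

  pshufd $0x50, %xmm0, %xmm0    # 5 bytes: 66 0f 70 c0 50
  # ... could become ...
  punpckldq %xmm0, %xmm0        # 4 bytes: 66 0f 62 c0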

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/avx-basic.ll
    llvm/trunk/test/CodeGen/X86/avx-sext.ll
    llvm/trunk/test/CodeGen/X86/avx-splat.ll
    llvm/trunk/test/CodeGen/X86/exedepsfix-broadcast.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=217755&r1=217754&r2=217755&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Sep 15 05:35:41 2014
@@ -7598,11 +7598,22 @@ static SDValue lowerV4I32VectorShuffle(S
   int NumV2Elements =
       std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; });
 
-  if (NumV2Elements == 0)
+  if (NumV2Elements == 0) {
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
+    // We coerce the shuffle pattern to be compatible with UNPCK instructions
+    // but we aren't actually going to use the UNPCK instruction because doing
+    // so prevents folding a load into this instruction or making a copy.
+    const int UnpackLoMask[] = {0, 0, 1, 1};
+    const int UnpackHiMask[] = {2, 2, 3, 3};
+    if (isShuffleEquivalent(Mask, 0, 0, 1, 1))
+      Mask = UnpackLoMask;
+    else if (isShuffleEquivalent(Mask, 2, 2, 3, 3))
+      Mask = UnpackHiMask;
+
     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                        getV4X86ShuffleImm8ForMask(Mask, DAG));
+  }
 
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
@@ -19347,86 +19358,75 @@ static bool combineX86ShuffleChain(SDVal
   bool FloatDomain = VT.isFloatingPoint();
 
   // For floating point shuffles, we don't have free copies in the shuffle
-  // instructions, so this always makes sense to canonicalize.
-  //
-  // For integer shuffles, if we don't have access to VEX encodings, the generic
-  // PSHUF instructions are preferable to some of the specialized forms despite
-  // requiring one more byte to encode because they can implicitly copy.
+  // instructions or the ability to load as part of the instruction, so
+  // canonicalize their shuffles to UNPCK or MOV variants.
   //
-  // IF we *do* have VEX encodings, then we can use shorter, more specific
-  // shuffle instructions freely as they can copy due to the extra register
-  // operand.
-  if (FloatDomain || Subtarget->hasAVX()) {
-    // We have both floating point and integer variants of shuffles that dup
-    // either the low or high half of the vector.
-    if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
-      bool Lo = Mask.equals(0, 0);
-      unsigned Shuffle;
-      MVT ShuffleVT;
-      // If the input is a floating point, check if we have SSE3 which will let
-      // us use MOVDDUP. That instruction is no slower than UNPCKLPD but has the
-      // option to fold the input operand into even an unaligned memory load.
-      if (FloatDomain && Lo && Subtarget->hasSSE3()) {
-        Shuffle = X86ISD::MOVDDUP;
-        ShuffleVT = MVT::v2f64;
-      } else if (FloatDomain) {
-        // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
-        // than the UNPCK variants.
-        Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
-        ShuffleVT = MVT::v4f32;
-      } else if (Subtarget->hasSSE2()) {
-        // We model everything else using UNPCK instructions. While MOVLHPS and
-        // MOVHLPS are shorter encodings they cannot accept a memory operand
-        // which overly constrains subsequent lowering.
-        Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
-        ShuffleVT = MVT::v2i64;
-      } else {
-        // No available instructions here.
-        return false;
-      }
-      if (Depth == 1 && Root->getOpcode() == Shuffle)
-        return false; // Nothing to do!
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
-      DCI.AddToWorklist(Op.getNode());
-      if (Shuffle == X86ISD::MOVDDUP)
-        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
-      else
-        Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
-      DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
-                    /*AddTo*/ true);
-      return true;
-    }
+  // Note that even with AVX we prefer the PSHUFD form of shuffle for integer
+  // vectors because it can have a load folded into it that UNPCK cannot. This
+  // doesn't preclude something switching to the shorter encoding post-RA.
+  if (FloatDomain && (Mask.equals(0, 0) || Mask.equals(1, 1))) {
+    bool Lo = Mask.equals(0, 0);
+    unsigned Shuffle;
+    MVT ShuffleVT;
+    // Check if we have SSE3 which will let us use MOVDDUP. That instruction
+    // is no slower than UNPCKLPD but has the option to fold the input operand
+    // into even an unaligned memory load.
+    if (Lo && Subtarget->hasSSE3()) {
+      Shuffle = X86ISD::MOVDDUP;
+      ShuffleVT = MVT::v2f64;
+    } else {
+      // We have MOVLHPS and MOVHLPS throughout SSE and they encode smaller
+      // than the UNPCK variants.
+      Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
+      ShuffleVT = MVT::v4f32;
+    }
+    if (Depth == 1 && Root->getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+    DCI.AddToWorklist(Op.getNode());
+    if (Shuffle == X86ISD::MOVDDUP)
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+    else
+      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+    DCI.AddToWorklist(Op.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                  /*AddTo*/ true);
+    return true;
+  }
 
-    // FIXME: We should match UNPCKLPS and UNPCKHPS here.
+  // FIXME: We should match UNPCKLPS and UNPCKHPS here.
 
-    // For the integer domain we have specialized instructions for duplicating
-    // any element size from the low or high half.
-    if (!FloatDomain &&
-        (Mask.equals(0, 0, 1, 1) || Mask.equals(2, 2, 3, 3) ||
-         Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
-         Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
-         Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
-         Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
-                     15))) {
-      bool Lo = Mask[0] == 0;
-      unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
-      if (Depth == 1 && Root->getOpcode() == Shuffle)
-        return false; // Nothing to do!
-      MVT ShuffleVT;
-      switch (Mask.size()) {
-      case 4: ShuffleVT = MVT::v4i32; break;
-      case 8: ShuffleVT = MVT::v8i16; break;
-      case 16: ShuffleVT = MVT::v16i8; break;
-      };
-      Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
-      DCI.AddToWorklist(Op.getNode());
-      Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
-      DCI.AddToWorklist(Op.getNode());
-      DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
-                    /*AddTo*/ true);
-      return true;
-    }
+  // We always canonicalize the 8 x i16 and 16 x i8 shuffles into their UNPCK
+  // variants as none of these have single-instruction variants that are
+  // superior to the UNPCK formulation.
+  if (!FloatDomain &&
+      (Mask.equals(0, 0, 1, 1, 2, 2, 3, 3) ||
+       Mask.equals(4, 4, 5, 5, 6, 6, 7, 7) ||
+       Mask.equals(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7) ||
+       Mask.equals(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15,
+                   15))) {
+    bool Lo = Mask[0] == 0;
+    unsigned Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+    if (Depth == 1 && Root->getOpcode() == Shuffle)
+      return false; // Nothing to do!
+    MVT ShuffleVT;
+    switch (Mask.size()) {
+    case 8:
+      ShuffleVT = MVT::v8i16;
+      break;
+    case 16:
+      ShuffleVT = MVT::v16i8;
+      break;
+    default:
+      llvm_unreachable("Impossible mask size!");
+    };
+    Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
+    DCI.AddToWorklist(Op.getNode());
+    Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+    DCI.AddToWorklist(Op.getNode());
+    DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
+                  /*AddTo*/ true);
+    return true;
   }
 
   // Don't try to re-form single instruction chains under any circumstances now

Modified: llvm/trunk/test/CodeGen/X86/avx-basic.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-basic.ll?rev=217755&r1=217754&r2=217755&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-basic.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-basic.ll Mon Sep 15 05:35:41 2014
@@ -72,9 +72,9 @@ entry:
   ret <4 x i64> %shuffle
 }
 
-; CHECK: vpunpcklqdq
+; CHECK: vmovlhps
 ; CHECK-NEXT: vextractf128  $1
-; CHECK-NEXT: vpunpcklqdq
+; CHECK-NEXT: vmovlhps
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @C(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
 entry:
@@ -83,7 +83,7 @@ entry:
 }
 
 ; CHECK: vpshufd $-96
-; CHECK: vpunpckhdq
+; CHECK: vpshufd $-6
 ; CHECK: vinsertf128 $1
 define <8 x i32> @D(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
 entry:

Modified: llvm/trunk/test/CodeGen/X86/avx-sext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-sext.ll?rev=217755&r1=217754&r2=217755&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-sext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-sext.ll Mon Sep 15 05:35:41 2014
@@ -156,7 +156,7 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x
 
 ; AVX-LABEL: sext_16i8_to_16i16
 ; AVX: vpmovsxbw
-; AVX: vpunpckhqdq
+; AVX: vmovhlps
 ; AVX: vpmovsxbw
 ; AVX: ret
 define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {

Modified: llvm/trunk/test/CodeGen/X86/avx-splat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-splat.ll?rev=217755&r1=217754&r2=217755&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-splat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-splat.ll Mon Sep 15 05:35:41 2014
@@ -19,7 +19,7 @@ entry:
 }
 
 ; CHECK: vmovq
-; CHECK-NEXT: vpunpcklqdq %xmm
+; CHECK-NEXT: vmovlhps %xmm
 ; CHECK-NEXT: vinsertf128 $1
 define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
 entry:

Modified: llvm/trunk/test/CodeGen/X86/exedepsfix-broadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/exedepsfix-broadcast.ll?rev=217755&r1=217754&r2=217755&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/exedepsfix-broadcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/exedepsfix-broadcast.ll Mon Sep 15 05:35:41 2014
@@ -95,8 +95,8 @@ define <4 x double> @ExeDepsFix_broadcas
 ; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
 ; ExeDepsFix works top down, thus it coalesces vpunpcklqdq domain with
 ; vpand and there is nothing more you can do to match vmaxpd.
-; CHECK: vpunpcklqdq
-; CHECK: vpand
+; CHECK: vmovlhps
+; CHECK: vandps
 ; CHECK: vmaxpd
 ; CHECK: ret
 define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll?rev=217755&r1=217754&r2=217755&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll Mon Sep 15 05:35:41 2014
@@ -639,7 +639,7 @@ define <8 x i16> @shuffle_v8i16_032dXXXX
 define <8 x i16> @shuffle_v8i16_XXXdXXXX(<8 x i16> %a, <8 x i16> %b) {
 ; ALL-LABEL: @shuffle_v8i16_XXXdXXXX
 ; ALL:       # BB#0:
-; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm1[0,2,2,3]
+; ALL-NEXT:    pshufd {{.*}} # xmm0 = xmm1[2,2,3,3]
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll?rev=217755&r1=217754&r2=217755&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v4.ll Mon Sep 15 05:35:41 2014
@@ -6,7 +6,7 @@ target triple = "x86_64-unknown-unknown"
 define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: @shuffle_v4i64_0001
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm1 = xmm0[0,0]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@@ -18,7 +18,7 @@ define <4 x i64> @shuffle_v4i64_0020(<4
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@@ -41,7 +41,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vblendpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@@ -52,7 +52,7 @@ define <4 x i64> @shuffle_v4i64_1000(<4
 ; AVX1-LABEL: @shuffle_v4i64_1000
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@@ -63,8 +63,8 @@ define <4 x i64> @shuffle_v4i64_2200(<4
 ; AVX1-LABEL: @shuffle_v4i64_2200
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>
@@ -76,7 +76,7 @@ define <4 x i64> @shuffle_v4i64_3330(<4
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vshufpd {{.*}} # xmm0 = xmm1[1],xmm0[0]
-; AVX1-NEXT:    vpunpckhqdq {{.*}} # xmm1 = xmm1[1,1]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm1[2,3,2,3]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0>
@@ -281,7 +281,7 @@ define <4 x i64> @shuffle_v4i64_0124(<4
 ; AVX1-LABEL: @shuffle_v4i64_0124
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vblendpd {{.*}} # xmm1 = xmm2[0],xmm1[1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -292,7 +292,7 @@ define <4 x i64> @shuffle_v4i64_0142(<4
 ; AVX1-LABEL: @shuffle_v4i64_0142
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm2 = xmm2[0,0]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm2 = xmm2[0,1,0,1]
 ; AVX1-NEXT:    vblendpd {{.*}} # xmm1 = xmm1[0],xmm2[1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -304,7 +304,7 @@ define <4 x i64> @shuffle_v4i64_0412(<4
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -316,7 +316,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vshufpd {{.*}} # xmm2 = xmm0[1],xmm2[0]
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -336,7 +336,7 @@ define <4 x i64> @shuffle_v4i64_0451(<4
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm2 = xmm1[2,3,0,1]
 ; AVX1-NEXT:    vblendpd {{.*}} # xmm2 = xmm2[0],xmm0[1]
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -356,7 +356,7 @@ define <4 x i64> @shuffle_v4i64_4015(<4
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm2 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vblendpd {{.*}} # xmm2 = xmm2[0],xmm1[1]
-; AVX1-NEXT:    vpunpcklqdq {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
 ; AVX1-NEXT:    vblendpd {{.*}} # xmm0 = xmm1[0],xmm0[1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -368,8 +368,7 @@ define <4 x i64> @stress_test1(<4 x i64>
 ; AVX1-LABEL: @stress_test1
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT:    vpunpckhqdq {{.*}} # xmm0 = xmm0[1,1]
-; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[2,3,2,3]
 ; AVX1-NEXT:    vblendpd {{.*}} # xmm0 = xmm0[0],xmm1[1]
 ; AVX1-NEXT:    vpshufd {{.*}} # xmm1 = xmm1[2,3,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0




