[llvm] r218430 - [x86] Implement v16i16 support with AVX2 in the new vector shuffle

Chandler Carruth chandlerc at gmail.com
Wed Sep 24 17:24:20 PDT 2014


Author: chandlerc
Date: Wed Sep 24 19:24:19 2014
New Revision: 218430

URL: http://llvm.org/viewvc/llvm-project?rev=218430&view=rev
Log:
[x86] Implement v16i16 support with AVX2 in the new vector shuffle
lowering.

This also implements the fancy blend lowering for v16i16 using AVX2 and
teaches the X86 backend to print shuffle masks for 256-bit PSHUFB and
PBLENDW instructions. It also corrects the mask decoding for PBLENDW,
which has to account for the immediate repeating across 128-bit lanes.
The yaks, they are legion.

Tests are updated accordingly. There are some missing tests for the
VBLENDVB lowering, but I'll add those in a follow-up as this commit has
accumulated enough cruft already.
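
For readers skimming the patch: VPBLENDW picks each 16-bit element from one
of two sources according to a bit of an 8-bit immediate, and the AVX2
256-bit form applies that same immediate to both 128-bit lanes. A minimal
scalar model of that semantic, in standalone C++ rather than the patch's
SelectionDAG code (the helper name here is ours):

  #include <array>
  #include <cstdint>

  // Scalar model of VPBLENDW ymm: bit i of the immediate selects the
  // element from V2 (bit set) or V1 (bit clear), and the immediate is
  // reused verbatim in each 128-bit lane (8 x i16 per lane).
  std::array<uint16_t, 16> blendw256(const std::array<uint16_t, 16> &V1,
                                     const std::array<uint16_t, 16> &V2,
                                     uint8_t Imm) {
    std::array<uint16_t, 16> R;
    for (int i = 0; i < 16; ++i) {
      int Bit = i % 8; // mirrored across the two 128-bit lanes
      R[i] = ((Imm >> Bit) & 1) ? V2[i] : V1[i];
    }
    return R;
  }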

Modified:
    llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp
    llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86MCInstLower.cpp
    llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll

Modified: llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp?rev=218430&r1=218429&r2=218430&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp (original)
+++ llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp Wed Sep 24 19:24:19 2014
@@ -102,6 +102,17 @@ bool llvm::EmitAnyX86InstComments(const
     Src1Name = getRegName(MI->getOperand(1).getReg());
     DestName = getRegName(MI->getOperand(0).getReg());
     break;
+  case X86::VPBLENDWYrri:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VPBLENDWYrmi:
+    if(MI->getOperand(MI->getNumOperands()-1).isImm())
+      DecodeBLENDMask(MVT::v16i16,
+                      MI->getOperand(MI->getNumOperands()-1).getImm(),
+                      ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
 
   case X86::VPBLENDDrri:
     Src2Name = getRegName(MI->getOperand(2).getReg());
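
The effect of the X86InstComments change is visible in verbose-asm output:
a 256-bit vpblendw now carries a lane-mirrored shuffle-mask comment such as
"ymm0 = ymm1[0],ymm0[1],..." (see the updated tests below). A sketch of the
formatting step, assuming a mask already decoded by DecodeBLENDMask; the
helper name and register strings are illustrative, not LLVM's own code:

  #include <cstdio>
  #include <string>
  #include <vector>

  // Render a decoded blend mask the way the comment printer does for
  // element-granular blends: one source[index] term per element.
  std::string formatBlendComment(const std::vector<int> &Mask,
                                 const char *Src1, const char *Src2) {
    int NumElts = (int)Mask.size();
    std::string S;
    for (int i = 0; i < NumElts; ++i) {
      char Buf[32];
      std::snprintf(Buf, sizeof(Buf), "%s%s[%d]", i ? "," : "",
                    Mask[i] < NumElts ? Src1 : Src2, Mask[i] % NumElts);
      S += Buf;
    }
    return S;
  }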

Modified: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp?rev=218430&r1=218429&r2=218430&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp (original)
+++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp Wed Sep 24 19:24:19 2014
@@ -301,11 +301,18 @@ void DecodePSHUFBMask(ArrayRef<uint64_t>
   }
 }
 
-void DecodeBLENDMask(MVT VT, unsigned Imm,
-                       SmallVectorImpl<int> &ShuffleMask) {
+void DecodeBLENDMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
+  int ElementBits = VT.getScalarSizeInBits();
   int NumElements = VT.getVectorNumElements();
-  for (int i = 0; i < NumElements; ++i)
-    ShuffleMask.push_back(((Imm >> i) & 1) ? NumElements + i : i);
+  for (int i = 0; i < NumElements; ++i) {
+    // If there are more than 8 elements in the vector, then any immediate blend
+    // mask applies to each 128-bit lane. There can never be more than
+    // 8 elements in a 128-bit lane with an immediate blend.
+    int Bit = NumElements > 8 ? i % (128 / ElementBits) : i;
+    assert(Bit < 8 &&
+           "Immediate blends only operate over 8 elements at a time!");
+    ShuffleMask.push_back(((Imm >> Bit) & 1) ? NumElements + i : i);
+  }
 }
 
 /// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
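
To make the per-lane wrap concrete, here is the fixed decode applied by
hand to a v16i16 blend. This is a self-contained sketch mirroring the loop
in DecodeBLENDMask above; the printing is only for illustration:

  #include <cstdio>
  #include <vector>

  // With 16 elements the 8-bit immediate cannot address them all, so it
  // repeats per 128-bit lane: element i tests bit (i % 8).
  std::vector<int> decodeBlendMaskV16I16(unsigned Imm) {
    std::vector<int> Mask;
    for (int i = 0; i < 16; ++i) {
      int Bit = i % 8; // 8 x i16 per 128-bit lane
      Mask.push_back(((Imm >> Bit) & 1) ? 16 + i : i);
    }
    return Mask;
  }

  int main() {
    // Imm = 0x0F takes the low four elements of *each* lane from V2:
    // 16 17 18 19 4 5 6 7 24 25 26 27 12 13 14 15
    for (int M : decodeBlendMaskV16I16(0x0F))
      std::printf("%d ", M);
    std::printf("\n");
  }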

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=218430&r1=218429&r2=218430&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Sep 24 19:24:19 2014
@@ -7183,6 +7183,56 @@ static bool isSingleInputShuffleMask(Arr
   return true;
 }
 
+/// \brief Test whether there are elements crossing 128-bit lanes in this
+/// shuffle mask.
+///
+/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
+/// and we routinely test for these.
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+      return true;
+  return false;
+}
+
+/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
+///
+/// This checks a shuffle mask to see if it is performing the same
+/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
+/// that it is also not lane-crossing. It may however involve a blend from the
+/// same lane of a second vector.
+///
+/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
+/// non-trivial to compute in the face of undef lanes. The representation is
+/// *not* suitable for use with existing 128-bit shuffles as it will contain
+/// entries from both V1 and V2 inputs to the wider mask.
+static bool
+is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
+                                SmallVectorImpl<int> &RepeatedMask) {
+  int LaneSize = 128 / VT.getScalarSizeInBits();
+  RepeatedMask.resize(LaneSize, -1);
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i) {
+    if (Mask[i] < 0)
+      continue;
+    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+      // This entry crosses lanes, so there is no way to model this shuffle.
+      return false;
+
+    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+    if (RepeatedMask[i % LaneSize] == -1)
+      // This is the first non-undef entry in this slot of a 128-bit lane.
+      RepeatedMask[i % LaneSize] =
+          Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
+    else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
+      // Found a mismatch with the repeated mask.
+      return false;
+  }
+  return true;
+}
+
 // Hide this symbol with an anonymous namespace instead of 'static' so that MSVC
 // 2013 will allow us to use it as a non-type template parameter.
 namespace {
@@ -7312,6 +7362,38 @@ static SDValue lowerVectorShuffleAsBlend
                                    DAG.getConstant(BlendMask, MVT::i8)));
   }
 
+  case MVT::v16i16: {
+    assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
+    SmallVector<int, 8> RepeatedMask;
+    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
+      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
+      BlendMask = 0;
+      for (int i = 0; i < 8; ++i)
+        if (RepeatedMask[i] >= 16)
+          BlendMask |= 1u << i;
+      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
+                         DAG.getConstant(BlendMask, MVT::i8));
+    }
+
+    // Fall back to a fully general variable byte blend.
+    SDValue PBLENDVMask[32];
+    // Scale the blend by the number of bytes per element.
+    int Scale =  VT.getScalarSizeInBits() / 8;
+    assert(Mask.size() * Scale == 32 && "Not a 256-bit vector!");
+    for (int i = 0, Size = Mask.size(); i < Size; ++i)
+      for (int j = 0; j < Scale; ++j)
+        PBLENDVMask[Scale * i + j] =
+            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
+                        : DAG.getConstant(Mask[i] < Size ? 0 : 0x80, MVT::i8);
+
+    V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1);
+    V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V2);
+    return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(
+        X86ISD::BLENDV, DL, MVT::v32i8, V1, V2,
+        DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PBLENDVMask)));
+  }
+
   default:
     llvm_unreachable("Not a supported integer vector type!");
   }
@@ -9215,56 +9297,6 @@ static SDValue lower128BitVectorShuffle(
   }
 }
 
-/// \brief Test whether there are elements crossing 128-bit lanes in this
-/// shuffle mask.
-///
-/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
-/// and we routinely test for these.
-static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
-  int LaneSize = 128 / VT.getScalarSizeInBits();
-  int Size = Mask.size();
-  for (int i = 0; i < Size; ++i)
-    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
-      return true;
-  return false;
-}
-
-/// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
-///
-/// This checks a shuffle mask to see if it is performing the same
-/// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
-/// that it is also not lane-crossing. It may however involve a blend from the
-/// same lane of a second vector.
-///
-/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
-/// non-trivial to compute in the face of undef lanes. The representation is
-/// *not* suitable for use with existing 128-bit shuffles as it will contain
-/// entries from both V1 and V2 inputs to the wider mask.
-static bool
-is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
-                                SmallVectorImpl<int> &RepeatedMask) {
-  int LaneSize = 128 / VT.getScalarSizeInBits();
-  RepeatedMask.resize(LaneSize, -1);
-  int Size = Mask.size();
-  for (int i = 0; i < Size; ++i) {
-    if (Mask[i] < 0)
-      continue;
-    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
-      // This entry crosses lanes, so there is no way to model this shuffle.
-      return false;
-
-    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
-    if (RepeatedMask[i % LaneSize] == -1)
-      // This is the first non-undef entry in this slot of a 128-bit lane.
-      RepeatedMask[i % LaneSize] =
-          Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
-    else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
-      // Found a mismatch with the repeated mask.
-      return false;
-  }
-  return true;
-}
-
 /// \brief Generic routine to split a 256-bit vector shuffle into 128-bit
 /// shuffles.
 ///
@@ -9581,9 +9613,74 @@ static SDValue lowerV16I16VectorShuffle(
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
   assert(Subtarget->hasAVX2() && "We can only lower v16i16 with AVX2!");
 
-  // FIXME: Actually implement this using AVX2!!!
-  (void)Mask;
-  return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
+                                                Subtarget, DAG))
+    return Blend;
+
+  // If the shuffle mask is repeated in each 128-bit lane we can use more
+  // efficient instructions that mirror the shuffles across the two 128-bit
+  // lanes.
+  SmallVector<int, 4> RepeatedMask;
+  if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
+    assert(RepeatedMask.size() == 8 && "Unexpected repeated mask size!");
+    // FIXME: It might be worth it to call into the (terribly complex) v8i16
+    // lowering here.
+
+    // Use dedicated unpack instructions for masks that match their pattern.
+    //
+    if (isShuffleEquivalent(Mask,
+                            // First 128-bit lane:
+                            0, 16, 1, 17, 2, 18, 3, 19,
+                            // Second 128-bit lane:
+                            8, 24, 9, 25, 10, 26, 11, 27))
+      return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i16, V1, V2);
+    if (isShuffleEquivalent(Mask,
+                            // First 128-bit lane:
+                            4,  20, 5,  21, 6, 22, 7, 23,
+                            // Second 128-bit lane:
+                            12, 28, 13, 29, 14, 30, 15, 31))
+      return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i16, V1, V2);
+  }
+
+  // There are no generalized cross-lane shuffle operations available on i16
+  // element types.
+  // FIXME: We should teach the "split and lower" path to do something more
+  // clever, or do it ourselves here. The optimal lowering of cross-lane
+  // shuffles I am aware of is to swap the lanes into a copy, shuffle both the
+  // original and the copy, and then blend to pick up the cross-lane elements.
+  // This is four instructions with a tree height of three which is better than
+  // the worst case for a gather-cross-scatter approach such as used in SSE2
+  // v8i16 lowering (where we don't have blends). While for cross-lane blends it
+  // results in a blend tree, blends are very cheap in AVX2 and newer chips. We
+  // might also want to special case situations where we can always do a single
+  // VPERMD to produce a non-lane-crossing shuffle.
+  if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
+    return splitAndLower256BitVectorShuffle(Op, V1, V2, Subtarget, DAG);
+
+  if (isSingleInputShuffleMask(Mask)) {
+    SDValue PSHUFBMask[32];
+    for (int i = 0; i < 16; ++i) {
+      if (Mask[i] == -1) {
+        PSHUFBMask[2 * i] = PSHUFBMask[2 * i + 1] = DAG.getUNDEF(MVT::i8);
+        continue;
+      }
+
+      int M = i < 8 ? Mask[i] : Mask[i] - 8;
+      assert(M >= 0 && M < 8 && "Invalid single-input mask!");
+      PSHUFBMask[2 * i] = DAG.getConstant(2 * M, MVT::i8);
+      PSHUFBMask[2 * i + 1] = DAG.getConstant(2 * M + 1, MVT::i8);
+    }
+    return DAG.getNode(
+        ISD::BITCAST, DL, MVT::v16i16,
+        DAG.getNode(
+            X86ISD::PSHUFB, DL, MVT::v32i8,
+            DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, V1),
+            DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, PSHUFBMask)));
+  }
+
+  // Otherwise fall back on generic blend lowering.
+  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i16, V1, V2,
+                                                    Mask, DAG);
 }
 
 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
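
The trickiest piece of the new lowerV16I16VectorShuffle path is the
single-input case, which widens the 16-entry word mask into the 32-entry
byte control vector VPSHUFB consumes. A standalone sketch of that expansion
(mirroring the PSHUFBMask construction above, with -1 standing in for
undef):

  #include <array>
  #include <cassert>

  // VPSHUFB shuffles bytes within each 128-bit lane, so a v16i16 word mask
  // must first be reduced to a lane-local word index (0..7) and then scaled
  // by two to address both bytes of the word.
  std::array<int, 32> expandToPSHUFBBytes(const std::array<int, 16> &Mask) {
    std::array<int, 32> Bytes;
    for (int i = 0; i < 16; ++i) {
      if (Mask[i] == -1) {
        Bytes[2 * i] = Bytes[2 * i + 1] = -1; // undef word -> undef bytes
        continue;
      }
      int M = i < 8 ? Mask[i] : Mask[i] - 8; // lane-local word index
      assert(M >= 0 && M < 8 && "mask must not cross 128-bit lanes");
      Bytes[2 * i] = 2 * M;
      Bytes[2 * i + 1] = 2 * M + 1;
    }
    return Bytes;
  }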

Modified: llvm/trunk/lib/Target/X86/X86MCInstLower.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86MCInstLower.cpp?rev=218430&r1=218429&r2=218430&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86MCInstLower.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86MCInstLower.cpp Wed Sep 24 19:24:19 2014
@@ -1098,7 +1098,8 @@ void X86AsmPrinter::EmitInstruction(cons
     // a constant shuffle mask. We won't be able to do this at the MC layer
     // because the mask isn't an immediate.
   case X86::PSHUFBrm:
-  case X86::VPSHUFBrm: {
+  case X86::VPSHUFBrm:
+  case X86::VPSHUFBYrm: {
     if (!OutStreamer.isVerboseAsm())
       break;
     assert(MI->getNumOperands() > 5 &&

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll?rev=218430&r1=218429&r2=218430&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll Wed Sep 24 19:24:19 2014
@@ -385,11 +385,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -407,11 +403,7 @@ define <16 x i16> @shuffle_v16i16_07_07_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*}} # xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
   ret <16 x i16> %shuffle
@@ -430,12 +422,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_04_04_04_04_08_08_08_08_12_12_12_12
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,8,9,8,9,8,9,8,9,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
   ret <16 x i16> %shuffle
@@ -454,12 +441,7 @@ define <16 x i16> @shuffle_v16i16_03_03_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_03_03_03_03_07_07_07_07_11_11_11_11_15_15_15_15
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,7,7,7,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm0 = xmm0[3,3,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,7,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[6,7,6,7,6,7,6,7,14,15,14,15,14,15,14,15,22,23,22,23,22,23,22,23,30,31,30,31,30,31,30,31]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7, i32 11, i32 11, i32 11, i32 11, i32 15, i32 15, i32 15, i32 15>
   ret <16 x i16> %shuffle
@@ -478,12 +460,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_02_02_04_04_06_06_08_08_10_10_12_12_14_14
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm0 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,16,17,16,17,20,21,20,21,24,25,24,25,28,29,28,29]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   ret <16 x i16> %shuffle
@@ -502,12 +479,7 @@ define <16 x i16> @shuffle_v16i16_01_01_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_01_01_03_03_05_05_07_07_09_09_11_11_13_13_15_15
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm0[1,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,5,5,7,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm0 = xmm0[1,1,3,3,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,5,5,7,7]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15,18,19,18,19,22,23,22,23,26,27,26,27,30,31,30,31]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
   ret <16 x i16> %shuffle
@@ -637,11 +609,7 @@ define <16 x i16> @shuffle_v16i16_16_01_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*}} # ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
   ret <16 x i16> %shuffle
@@ -684,16 +652,9 @@ define <16 x i16> @shuffle_v16i16_00_16_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vmovdqa {{.*}} # xmm3 = [0,1,0,1,4,5,0,1,0,1,0,1,12,13,0,1]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT:    vpshufd {{.*}} # xmm4 = xmm4[0,0,0,0]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
-; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,0,0,0]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,0,1,u,u,0,1,u,u,0,1,u,u,0,1,u,u,16,17,u,u,16,17,u,u,16,17,u,u,16,17]
+; AVX2-NEXT:    vpshufd {{.*}} # ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX2-NEXT:    vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 0, i32 16, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24, i32 8, i32 24>
   ret <16 x i16> %shuffle
@@ -713,13 +674,8 @@ define <16 x i16> @shuffle_v16i16_16_16_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_15
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm3 = xmm3[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*}} # xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[0,1,0,1,0,1,0,1,u,u,u,u,u,u,u,u,16,17,16,17,16,17,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 4, i32 5, i32 6, i32 7, i32 24, i32 24, i32 24, i32 24, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i16> %shuffle
@@ -741,15 +697,9 @@ define <16 x i16> @shuffle_v16i16_19_18_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm2 = xmm2[0,1,2,3,7,6,5,4]
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm3 = xmm3[3,2,1,0,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*}} # xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[u,u,u,u,u,u,u,u,14,15,12,13,10,11,8,9,u,u,u,u,u,u,u,u,30,31,28,29,26,27,24,25]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 7, i32 6, i32 5, i32 4, i32 27, i32 26, i32 25, i32 24, i32 15, i32 14, i32 13, i32 12>
   ret <16 x i16> %shuffle
@@ -773,17 +723,9 @@ define <16 x i16> @shuffle_v16i16_19_18_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_19_18_17_16_03_02_01_00_27_26_25_24_11_10_09_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm2 = xmm2[3,2,1,0,4,5,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpshufd {{.*}} # xmm3 = xmm3[0,1,0,1]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm3 = xmm3[0,1,2,3,7,6,5,4]
-; AVX2-NEXT:    vpblendd {{.*}} # xmm2 = xmm2[0,1],xmm3[2,3]
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm1[3,2,1,0,4,5,6,7]
-; AVX2-NEXT:    vpshufd {{.*}} # xmm0 = xmm0[0,1,0,1]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
-; AVX2-NEXT:    vpblendd {{.*}} # xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[u,u,u,u,u,u,u,u,6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[6,7,4,5,2,3,0,1,u,u,u,u,u,u,u,u,22,23,20,21,18,19,16,17,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpblendd {{.*}} # ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 19, i32 18, i32 17, i32 16, i32 3, i32 2, i32 1, i32 0, i32 27, i32 26, i32 25, i32 24, i32 11, i32 10, i32 9, i32 8>
   ret <16 x i16> %shuffle
@@ -801,11 +743,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,16,17,16,17,16,17,16,17,16,17,18,19,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 9, i32 8>
   ret <16 x i16> %shuffle
@@ -823,11 +761,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,16,17,16,17,16,17,20,21,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 10, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -845,11 +779,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,16,17,22,23,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -867,11 +797,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_12_08_08_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*}} # xmm2 = [0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,24,25,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -889,11 +815,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_13_08_08_08_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*}} # xmm2 = [0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,26,27,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 13, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -911,11 +833,7 @@ define <16 x i16> @shuffle_v16i16_00_06_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_06_00_00_00_00_00_00_08_14_08_08_08_08_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*}} # xmm2 = [0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,28,29,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 14, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -933,11 +851,7 @@ define <16 x i16> @shuffle_v16i16_07_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_07_00_00_00_00_00_00_00_15_08_08_08_08_08_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vmovdqa {{.*}} # xmm2 = [14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,30,31,16,17,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -959,15 +873,7 @@ define <16 x i16> @shuffle_v16i16_00_16_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_16_01_17_02_18_03_19_08_24_09_25_10_26_11_27
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpunpcklwd {{.*}} # xmm2 = xmm2[0,0,1,1,2,2,3,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpmovzxwd %xmm3, %xmm3
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX2-NEXT:    vpunpcklwd {{.*}} # xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; AVX2-NEXT:    vpmovzxwd %xmm0, %xmm0
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklwd {{.*}} # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
   ret <16 x i16> %shuffle
@@ -989,15 +895,7 @@ define <16 x i16> @shuffle_v16i16_04_20_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_04_20_05_21_06_22_07_23_12_28_13_29_14_30_15_31
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpunpckhwd {{.*}} # xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpunpckhwd {{.*}} # xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX2-NEXT:    vpunpckhwd {{.*}} # xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpunpckhwd {{.*}} # xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhwd {{.*}} # ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   ret <16 x i16> %shuffle
@@ -1019,15 +917,9 @@ define <16 x i16> @shuffle_v16i16_00_16_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_16_01_17_02_18_03_19_12_28_13_29_14_30_15_31
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpunpckhwd {{.*}} # xmm2 = xmm2[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpunpckhwd {{.*}} # xmm3 = xmm3[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX2-NEXT:    vpunpcklwd {{.*}} # xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; AVX2-NEXT:    vpmovzxwd %xmm0, %xmm0
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,2,3,u,u,4,5,u,u,6,7,u,u,24,25,u,u,26,27,u,u,28,29,u,u,30,31,u,u]
+; AVX2-NEXT:    vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   ret <16 x i16> %shuffle
@@ -1049,15 +941,9 @@ define <16 x i16> @shuffle_v16i16_04_20_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_04_20_05_21_06_22_07_23_08_24_09_25_10_26_11_27
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpunpcklwd {{.*}} # xmm2 = xmm2[0,0,1,1,2,2,3,3]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpmovzxwd %xmm3, %xmm3
-; AVX2-NEXT:    vpblendw {{.*}} # xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX2-NEXT:    vpunpckhwd {{.*}} # xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpunpckhwd {{.*}} # xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpblendw {{.*}} # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm1 = ymm1[u,u,8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23]
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[8,9,u,u,10,11,u,u,12,13,u,u,14,15,u,u,16,17,u,u,18,19,u,u,20,21,u,u,22,23,u,u]
+; AVX2-NEXT:    vpblendw {{.*}} # ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
   ret <16 x i16> %shuffle
@@ -1074,10 +960,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_00_01_00_08_09_08_08_08_08_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1,16,17,18,19,16,17,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 8, i32 9, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1094,10 +977,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_10_08_08_08_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1,16,17,16,17,20,21,16,17,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0, i32 8, i32 8, i32 10, i32 8, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1114,10 +994,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_11_08_08_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,6,7,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1,16,17,16,17,16,17,22,23,16,17,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 11, i32 8, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1134,10 +1011,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_00_04_00_00_00_00_08_08_08_08_12_08_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,8,9,0,1,0,1,0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,16,17,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 8, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1154,10 +1028,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_05_00_00_00_00_00_08_08_08_08_08_13_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*}} # xmm1 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,26,27,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 13, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1174,10 +1045,7 @@ define <16 x i16> @shuffle_v16i16_00_06_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_06_00_00_00_00_00_00_08_08_08_08_08_08_14_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*}} # xmm1 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,28,29,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 14, i32 8>
   ret <16 x i16> %shuffle
@@ -1194,10 +1062,7 @@ define <16 x i16> @shuffle_v16i16_07_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_07_00_00_00_00_00_00_00_08_08_08_08_08_08_08_15
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*}} # xmm1 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,14,15]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,30,31]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 15>
   ret <16 x i16> %shuffle
@@ -1215,11 +1080,7 @@ define <16 x i16> @shuffle_v16i16_00_00_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_00_02_02_04_04_06_06_14_14_12_12_10_10_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm0[0,0,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[12,13,12,13,8,9,8,9,4,5,4,5,0,1,0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13,28,29,28,29,24,25,24,25,20,21,20,21,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 14, i32 14, i32 12, i32 12, i32 10, i32 10, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1237,11 +1098,7 @@ define <16 x i16> @shuffle_v16i16_04_04_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_04_04_04_04_00_00_00_00_08_08_08_08_12_12_12_12
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*}} # xmm1 = xmm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[8,9,8,9,8,9,8,9,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,24,25,24,25,24,25,24,25]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 12, i32 12, i32 12, i32 12>
   ret <16 x i16> %shuffle
@@ -1258,10 +1115,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*}} # xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,u,u,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,28,29,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 14, i32 8>
   ret <16 x i16> %shuffle
@@ -1278,10 +1132,7 @@ define <16 x i16> @shuffle_v16i16_07_uu_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_07_uu_00_00_00_00_00_00_08_08_uu_uu_08_08_08_15
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*}} # xmm1 = xmm0[14,15,2,3,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,0,1,14,15]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[14,15,u,u,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,u,u,u,u,16,17,16,17,16,17,30,31]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 7, i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 15>
   ret <16 x i16> %shuffle
@@ -1299,11 +1150,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_00_uu_uu_02_04_04_uu_06_14_14_uu_12_10_10_08_08
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm0[0,1,2,2,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm1 = xmm1[0,1,2,3,4,4,6,6]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*}} # xmm0 = xmm0[12,13,12,13,12,13,8,9,4,5,4,5,0,1,0,1]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[0,1,u,u,u,u,4,5,8,9,8,9,u,u,12,13,28,29,28,29,u,u,24,25,20,21,20,21,16,17,16,17]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 undef, i32 undef, i32 2, i32 4, i32 4, i32 undef, i32 6, i32 14, i32 14, i32 undef, i32 12, i32 10, i32 10, i32 8, i32 8>
   ret <16 x i16> %shuffle
@@ -1322,12 +1169,7 @@ define <16 x i16> @shuffle_v16i16_04_04_
 ;
 ; AVX2-LABEL: @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufd {{.*}} # xmm1 = xmm0[2,1,2,3]
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT:    vpshuflw {{.*}} # xmm0 = xmm0[0,0,0,3,4,5,6,7]
-; AVX2-NEXT:    vpshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpshufb {{.*}} # ymm0 = ymm0[8,9,8,9,8,9,8,9,u,u,u,u,u,u,u,u,16,17,16,17,16,17,u,u,u,u,24,25,24,25,24,25]
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 undef, i32 undef, i32 undef, i32 undef, i32 8, i32 8, i32 8, i32 undef, i32 undef, i32 12, i32 12, i32 12>
   ret <16 x i16> %shuffle
