[llvm] bd17ced - Revert "[X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats"

Thu Aug 5 10:02:58 PDT 2021

Author: Benjamin Kramer
Date: 2021-08-05T18:58:08+02:00
New Revision: bd17ced1db9a674fc8aa6632899e245672c7aa35

URL: https://github.com/llvm/llvm-project/commit/bd17ced1db9a674fc8aa6632899e245672c7aa35
DIFF: https://github.com/llvm/llvm-project/commit/bd17ced1db9a674fc8aa6632899e245672c7aa35.diff

LOG: Revert "[X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats"

This reverts commits f819e4c7d0f6efef3cc1042cc45582320bf6c0a2 and
35c0848b570214ed2b2d96cca4dd62bb7ae725cd. It triggers an infinite loop during
compilation.

$ cat t.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @MaxPoolGradGrad_1.65() local_unnamed_addr #0 {
entry:
  %wide.vec78 = load <64 x i32>, <64 x i32>* null, align 16
  %strided.vec83 = shufflevector <64 x i32> %wide.vec78, <64 x i32> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
  %0 = lshr <8 x i32> %strided.vec83, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = add <8 x i32> zeroinitializer, %0
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %3 = shufflevector <16 x i32> %2, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %interleaved.vec = shufflevector <32 x i32> undef, <32 x i32> %3, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <64 x i32> %interleaved.vec, <64 x i32>* undef, align 16
  unreachable
}

$ llc < t.ll -mcpu=skylake
<hang>

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/avx.ll
    llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
    llvm/test/CodeGen/X86/pr15296.ll
    llvm/test/CodeGen/X86/sse41.ll
    llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c9c22975d19aa..c8bc34e14fa18 100644

--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35796,19 +35796,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                      (RootVT.isFloatingPoint() && Depth >= 1) ||
                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
 
-  // How many elements does each of the inputs have, given the current
-  // granularity of the root shuffle? Note that while currently the sizes of an
-  // inputs must match the size of the shuffle root,
-  // that restriction will be lifted in the future.
-  SmallVector<unsigned, 2> InputNumElts;
-  llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
-                  std::back_inserter(InputNumElts),
-                  [BaseMaskEltSizeInBits](MVT VT) {
-                    assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
-                           "Input is not a multiple of output element width?");
-                    return VT.getSizeInBits() / BaseMaskEltSizeInBits;
-                  });
-
   // Don't combine if we are a AVX512/EVEX target and the mask element size
   // is 
diff erent from the root element size - this would prevent writemasks
   // from being reused.
@@ -35823,44 +35810,19 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // If we are shuffling a broadcast (and not introducing zeros) then
   // we can just use the broadcast directly. This works for smaller broadcast
   // elements as well as they already repeat across each mask element
-  SmallVector<bool, 2> InputIsSplat;
-  llvm::transform(
-      std::initializer_list<SDValue>({V1, V2}),
-      std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
-        return isTargetShuffleSplat(V) &&
-               (BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
-      });
-  if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
+  if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
+      (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
       V1.getValueSizeInBits() >= RootSizeInBits) {
     return CanonicalizeShuffleInput(RootVT, V1);
   }
 
-  SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
-
-  // Adjust mask elements that pick from a splat input to be identity mask elts,
-  // i.e. to pick from the same lane of the input as the mask element is in.
-  // This may allow to simplify the shuffle into a blend.
-  if (InputIsSplat[0] || InputIsSplat[1]) {
-    for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
-      int &M = Mask[i];
-      assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
-             "OOB mask element?");
-      if (M < 0)
-        continue; // Keep the undef/zero mask elements as-is.
-      int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
-      // Is the used input wide-enough to contain that lane, and is it a splat?
-      if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
-        M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
-    }
-  }
-
   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
   // etc. can be simplified.
   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
     SmallVector<int> ScaledMask, IdentityMask;
     unsigned NumElts = VT1.getVectorNumElements();
-    if (Mask.size() <= NumElts &&
-        scaleShuffleElements(Mask, NumElts, ScaledMask)) {
+    if (BaseMask.size() <= NumElts &&
+        scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
       for (unsigned i = 0; i != NumElts; ++i)
         IdentityMask.push_back(i);
       if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
@@ -35874,14 +35836,14 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     // If the upper subvectors are zeroable, then an extract+insert is more
     // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
     // to zero the upper subvectors.
-    if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
+    if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
         return SDValue(); // Nothing to do!
-      assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
+      assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
              "Unexpected lane shuffle");
       Res = CanonicalizeShuffleInput(RootVT, V1);
-      unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
-      bool UseZero = isAnyZero(Mask);
+      unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
+      bool UseZero = isAnyZero(BaseMask);
       Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
       return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
     }
@@ -35889,7 +35851,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     // Narrow shuffle mask to v4x128.
     SmallVector<int, 4> ScaledMask;
     assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
-    narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
+    narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, ScaledMask);
 
     // Try to lower to vshuf64x2/vshuf32x4.
     auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
@@ -35948,20 +35910,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     // If the upper half is zeroable, then an extract+insert is more optimal
     // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
     // zero the upper half.
-    if (isUndefOrZero(Mask[1])) {
+    if (isUndefOrZero(BaseMask[1])) {
       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
         return SDValue(); // Nothing to do!
-      assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
+      assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
       Res = CanonicalizeShuffleInput(RootVT, V1);
-      Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
-      return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
-                            256);
+      Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
+      return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+                            DL, 256);
     }
 
     // If we're splatting the low subvector, an insert-subvector 'concat'
     // pattern is quicker than VPERM2X128.
     // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
-    if (Mask[0] == 0 && Mask[1] == 0 && !Subtarget.hasAVX2()) {
+    if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
         return SDValue(); // Nothing to do!
       Res = CanonicalizeShuffleInput(RootVT, V1);
@@ -35976,11 +35938,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     // we need to use the zeroing feature.
     // Prefer blends for sequential shuffles unless we are optimizing for size.
     if (UnaryShuffle &&
-        !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
-        (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
+        !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
+        (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
       unsigned PermMask = 0;
-      PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
-      PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
+      PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+      PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
       return DAG.getNode(
           X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
           DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
@@ -35991,15 +35953,16 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
 
     // TODO - handle AVX512VL cases with X86ISD::SHUF128.
     if (!UnaryShuffle && !IsMaskedShuffle) {
-      assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
+      assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
              "Unexpected shuffle sentinel value");
       // Prefer blends to X86ISD::VPERM2X128.
-      if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
+      if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
+            (BaseMask[0] == 2 && BaseMask[1] == 1))) {
         unsigned PermMask = 0;
-        PermMask |= ((Mask[0] & 3) << 0);
-        PermMask |= ((Mask[1] & 3) << 4);
-        SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
-        SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
+        PermMask |= ((BaseMask[0] & 3) << 0);
+        PermMask |= ((BaseMask[1] & 3) << 4);
+        SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
+        SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
         return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
                           CanonicalizeShuffleInput(RootVT, LHS),
                           CanonicalizeShuffleInput(RootVT, RHS),
@@ -36010,12 +35973,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
 
   // For masks that have been widened to 128-bit elements or more,
   // narrow back down to 64-bit elements.
+  SmallVector<int, 64> Mask;
   if (BaseMaskEltSizeInBits > 64) {
     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
     int MaskScale = BaseMaskEltSizeInBits / 64;
-    SmallVector<int, 64> ScaledMask;
-    narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
-    Mask = std::move(ScaledMask);
+    narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
+  } else {
+    Mask.assign(BaseMask.begin(), BaseMask.end());
   }
 
   // For masked shuffles, we're trying to match the root width for better

diff  --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index b542f173ae982..a176edba13aa9 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -153,11 +153,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
-; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
-; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
-; X32-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
+; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
+; X32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
 ; X32-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    retl
@@ -165,11 +165,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4
-; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
-; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
-; X64-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
+; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
+; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
 ; X64-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index a763f92b76409..71682094d64e9 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4315,7 +4315,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,5,2,7]
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,4,2,6]
 ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
@@ -4340,7 +4340,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x doub
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,5,2,7]
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,4,2,6]
 ; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}

diff  --git a/llvm/test/CodeGen/X86/pr15296.ll b/llvm/test/CodeGen/X86/pr15296.ll
index f957557130008..71034f696429c 100644
--- a/llvm/test/CodeGen/X86/pr15296.ll
+++ b/llvm/test/CodeGen/X86/pr15296.ll
@@ -26,11 +26,28 @@ allocas:
 define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind {
 ; CHECK-LABEL: shiftInput___canonical:
 ; CHECK:       # %bb.0: # %allocas
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpsrld %xmm2, %xmm1, %xmm1
-; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
+; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpsrld %xmm2, %xmm3, %xmm4
+; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm5
+; CHECK-NEXT:    vpsrld %xmm5, %xmm3, %xmm6
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
+; CHECK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3,4,5,6,7]
+; CHECK-NEXT:    vpsrld %xmm6, %xmm3, %xmm7
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; CHECK-NEXT:    vpsrld %xmm1, %xmm3, %xmm3
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
+; CHECK-NEXT:    vpsrld %xmm5, %xmm0, %xmm4
+; CHECK-NEXT:    vpsrld %xmm6, %xmm0, %xmm5
+; CHECK-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
+; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; CHECK-NEXT:    retl
 allocas:
   %smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0

diff  --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 1a1b976c45403..17aae3373c5ec 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -1661,15 +1661,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX1-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
-; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
-; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
+; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
+; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
 ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
-; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
-; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
-; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
+; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
+; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
+; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
 ; X86-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
 ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
@@ -1679,16 +1679,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX512-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
-; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
-; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
-; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
-; X86-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
-; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
-; X86-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
+; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
+; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
 ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
+; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
+; X86-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
+; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
+; X86-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
 ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -1712,15 +1712,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
-; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
-; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
+; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
+; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
 ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
-; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
-; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
-; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
+; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
+; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
+; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
+; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
 ; X64-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
 ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
@@ -1728,16 +1728,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
-; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
-; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
-; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
-; X64-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
-; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
-; X64-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
+; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
+; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
+; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
 ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
+; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
+; X64-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
+; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
+; X64-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
 ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %1 = getelementptr inbounds float, float* %fb, i64 %index

diff  --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 96bcaa1a0d443..d280580f55f1b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4591,14 +4591,14 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
-; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VLBW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: