[llvm] f819e4c - [X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 4 06:55:16 PDT 2021


Author: Roman Lebedev
Date: 2021-08-04T16:55:04+03:00
New Revision: f819e4c7d0f6efef3cc1042cc45582320bf6c0a2

URL: https://github.com/llvm/llvm-project/commit/f819e4c7d0f6efef3cc1042cc45582320bf6c0a2
DIFF: https://github.com/llvm/llvm-project/commit/f819e4c7d0f6efef3cc1042cc45582320bf6c0a2.diff

LOG: [X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats

Given a shuffle mask, if it is picking from an input that is a splat
at the current granularity of the shuffle, then adjust the mask
to pick from the same lane of the input that the mask element is in.
This may result in the shuffle being simplified into a blend.
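
As an illustration, here is a minimal standalone sketch of the transform
on plain integer masks (a hypothetical helper, not the actual LLVM code;
it assumes both inputs are as wide as the root shuffle, so the width
check from the patch is omitted):

    #include <vector>

    // Mask elements in [0, NumElts) pick from input 0, elements in
    // [NumElts, 2*NumElts) pick from input 1, negative means undef/zero.
    void canonicalizeSplatMask(std::vector<int> &Mask, bool Input0IsSplat,
                               bool Input1IsSplat) {
      const int NumElts = (int)Mask.size();
      for (int i = 0; i != NumElts; ++i) {
        int M = Mask[i];
        if (M < 0)
          continue; // Keep the undef/zero mask elements as-is.
        int InputIdx = M < NumElts ? 0 : 1;
        bool IsSplat = InputIdx == 0 ? Input0IsSplat : Input1IsSplat;
        // Every lane of a splat holds the same value, so the element can
        // be retargeted to the lane it lands in, i.e. an identity lane.
        if (IsSplat)
          Mask[i] = i + InputIdx * NumElts;
      }
    }

For example, with input 1 a splat, the insertion mask <0,1,2,4> becomes
<0,1,2,7>, which is a plain blend.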

I believe this is correct, given that the splat detection matches the one
just above the new code.

My basic thought is that we might be able to get fewer regressions
by handling multiple insertions of the same value into a vector
if we form broadcasts+blend here, as opposed to D105390,
but I have not really thought this through
and have not tried implementing it yet.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D107009

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/avx.ll
    llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
    llvm/test/CodeGen/X86/pr15296.ll
    llvm/test/CodeGen/X86/sse41.ll
    llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 144c81b3ebebf..b435f13632c79 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35797,6 +35797,19 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                      (RootVT.isFloatingPoint() && Depth >= 1) ||
                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
 
+  // How many elements does each of the inputs have, given the current
+  // granularity of the root shuffle? Note that while currently the sizes of
+  // the inputs must match the size of the shuffle root,
+  // that restriction will be lifted in the future.
+  SmallVector<unsigned, 2> InputNumElts;
+  llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
+                  std::back_inserter(InputNumElts),
+                  [BaseMaskEltSizeInBits](MVT VT) {
+                    assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
+                           "Input is not a multiple of output element width?");
+                    return VT.getSizeInBits() / BaseMaskEltSizeInBits;
+                  });
+
   // Don't combine if we are a AVX512/EVEX target and the mask element size
   // is different from the root element size - this would prevent writemasks
   // from being reused.
@@ -35811,12 +35824,38 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // If we are shuffling a broadcast (and not introducing zeros) then
   // we can just use the broadcast directly. This works for smaller broadcast
   // elements as well as they already repeat across each mask element
-  if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
-      (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+  SmallVector<bool, 2> InputIsSplat;
+  llvm::transform(
+      std::initializer_list<SDValue>({V1, V2}),
+      std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
+        return isTargetShuffleSplat(V) &&
+               (BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
+      });
+  if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
       V1.getValueSizeInBits() >= RootSizeInBits) {
     return CanonicalizeShuffleInput(RootVT, V1);
   }
 
+  // Adjust mask elements that pick from a splat input to be identity mask elts,
+  // i.e. to pick from the same lane of the input as the mask element is in.
+  // This may allow the shuffle to be simplified into a blend.
+  SmallVector<int> NewMask;
+  if (InputIsSplat[0] || InputIsSplat[1]) {
+    NewMask.assign(BaseMask.begin(), BaseMask.end());
+    for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
+      int &M = NewMask[i];
+      assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
+             "OOB mask element?");
+      if (M < 0)
+        continue; // Keep the undef/zero mask elements as-is.
+      int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
+      // Is the used input wide enough to contain that lane, and is it a splat?
+      if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
+        M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
+    }
+    BaseMask = std::move(NewMask);
+  }
+
   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
   // etc. can be simplified.
   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
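
To connect the code change to the test diffs below, here is a hedged
usage sketch, reusing the hypothetical canonicalizeSplatMask helper from
above on the mask from insertps_from_broadcast_multiple_use, where xmm4
is a vbroadcastss splat (i.e. input 1 is a splat):

    #include <cstdio>
    #include <vector>

    int main() {
      // xmm0 = xmm0[0,1,2],xmm4[0] : lane 3 inserts element 0 of the splat.
      std::vector<int> Mask = {0, 1, 2, 4};
      canonicalizeSplatMask(Mask, /*Input0IsSplat=*/false,
                            /*Input1IsSplat=*/true);
      for (int M : Mask)
        printf("%d ", M); // "0 1 2 7", i.e. xmm0[0,1,2],xmm4[3]: one vblendps.
      return 0;
    }

The partial_permute.ll change follows the same rule: in [3,4,2,6] the
elements 4 and 6 pick from the broadcast input, so they are retargeted to
the identity lanes 5 and 7, giving [3,5,2,7].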

diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index a176edba13aa9..b542f173ae982 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -153,11 +153,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X32-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X32-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    retl
@@ -165,11 +165,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X64-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    retq

diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 71682094d64e9..a763f92b76409 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4315,7 +4315,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,4,2,6]
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,5,2,7]
 ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
@@ -4340,7 +4340,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x doub
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,4,2,6]
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,5,2,7]
 ; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}

diff --git a/llvm/test/CodeGen/X86/pr15296.ll b/llvm/test/CodeGen/X86/pr15296.ll
index 71034f696429c..f957557130008 100644
--- a/llvm/test/CodeGen/X86/pr15296.ll
+++ b/llvm/test/CodeGen/X86/pr15296.ll
@@ -26,28 +26,11 @@ allocas:
 define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind {
 ; CHECK-LABEL: shiftInput___canonical:
 ; CHECK:       # %bb.0: # %allocas
-; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT:    vpsrld %xmm2, %xmm3, %xmm4
-; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm5
-; CHECK-NEXT:    vpsrld %xmm5, %xmm3, %xmm6
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3,4,5,6,7]
-; CHECK-NEXT:    vpsrld %xmm6, %xmm3, %xmm7
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; CHECK-NEXT:    vpsrld %xmm1, %xmm3, %xmm3
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
-; CHECK-NEXT:    vpsrld %xmm5, %xmm0, %xmm4
-; CHECK-NEXT:    vpsrld %xmm6, %xmm0, %xmm5
-; CHECK-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vpsrld %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retl
 allocas:
   %smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0

diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 17aae3373c5ec..1a1b976c45403 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -1661,15 +1661,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX1-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
 ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
+; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
+; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
 ; X86-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
 ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
@@ -1679,16 +1679,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX512-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X86-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
+; X86-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
 ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X86-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X86-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
-; X86-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
+; X86-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
 ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -1712,15 +1712,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
 ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
+; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
+; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
 ; X64-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
 ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
@@ -1728,16 +1728,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X64-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
+; X64-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
 ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X64-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X64-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
-; X64-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
+; X64-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
 ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %1 = getelementptr inbounds float, float* %fb, i64 %index

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d280580f55f1b..96bcaa1a0d443 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4591,14 +4591,14 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
-; AVX512VLBW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:


        

