[llvm] f819e4c - [X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats

Benjamin Kramer via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 5 10:03:52 PDT 2021


This causes infinite loops during compilation; reverted in
https://github.com/llvm/llvm-project/commit/bd17ced1db9a674fc8aa6632899e245672c7aa35

Test case:

$ cat t.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define void @MaxPoolGradGrad_1.65() local_unnamed_addr #0 {
entry:
  %wide.vec78 = load <64 x i32>, <64 x i32>* null, align 16
  %strided.vec83 = shufflevector <64 x i32> %wide.vec78, <64 x i32> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
  %0 = lshr <8 x i32> %strided.vec83, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = add <8 x i32> zeroinitializer, %0
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %3 = shufflevector <16 x i32> %2, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %interleaved.vec = shufflevector <32 x i32> undef, <32 x i32> %3, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
  store <64 x i32> %interleaved.vec, <64 x i32>* undef, align 16
  unreachable
}

$ llc < t.ll -mcpu=skylake
<hang>
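
For context on what the patch was doing: the quoted commit message below
describes rewriting mask elements that read from a splat input so that they
read from the lane they land in. A minimal standalone sketch of that
canonicalization (hypothetical helper names and plain int masks, not the
actual SDValue/mask machinery in combineX86ShuffleChain) looks like this:

#include <cstdio>
#include <vector>

// Canonicalize a two-input shuffle mask, assuming both inputs have the same
// number of elements as the mask. IsSplat[N] stands in for the real
// isTargetShuffleSplat() query on input N.
static void canonicalizeMask(std::vector<int> &Mask, const bool IsSplat[2]) {
  const int NumElts = static_cast<int>(Mask.size());
  for (int I = 0; I != NumElts; ++I) {
    int &M = Mask[I];
    if (M < 0)
      continue; // Keep undef/zero sentinel elements as-is.
    const int InputIdx = M < NumElts ? 0 : 1;
    if (IsSplat[InputIdx])
      M = I + InputIdx * NumElts; // Pick from the same lane of that input.
  }
}

int main() {
  // insertps-style mask: lanes 0-2 from input 0, lane 3 reads element 0 of
  // input 1. If input 1 is a splat, element 0 equals element 3, so the mask
  // canonicalizes to <0,1,2,7>, a plain blend.
  std::vector<int> Mask = {0, 1, 2, 4};
  const bool IsSplat[2] = {false, true};
  canonicalizeMask(Mask, IsSplat);
  for (int M : Mask)
    std::printf("%d ", M); // Prints: 0 1 2 7
  std::printf("\n");
  return 0;
}

That lane-identity rewrite is what turns the vinsertps sequences into
vblendps in the avx.ll and sse41.ll hunks quoted below.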

On Wed, Aug 4, 2021 at 3:55 PM Roman Lebedev via llvm-commits
<llvm-commits at lists.llvm.org> wrote:
>
>
> Author: Roman Lebedev
> Date: 2021-08-04T16:55:04+03:00
> New Revision: f819e4c7d0f6efef3cc1042cc45582320bf6c0a2
>
> URL: https://github.com/llvm/llvm-project/commit/f819e4c7d0f6efef3cc1042cc45582320bf6c0a2
> DIFF: https://github.com/llvm/llvm-project/commit/f819e4c7d0f6efef3cc1042cc45582320bf6c0a2.diff
>
> LOG: [X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats
>
> Given a shuffle mask, if it is picking from an input that is a splat
> given the current granularity of the shuffle, then adjust the mask
> to pick from the same lane of the input as the mask element is in.
> This may result in a shuffle being simplified into a blend.
>
> I believe this is correct given that the splat detection matches the one
> just above the new code.
>
> My basic thought is that we might be able to get fewer regressions
> by handling multiple insertions of the same value into a vector
> if we form broadcasts+blend here, as opposed to D105390,
> but I have not really thought this through,
> and have not tried implementing it yet.
>
> Reviewed By: RKSimon
>
> Differential Revision: https://reviews.llvm.org/D107009
>
> Added:
>
>
> Modified:
>     llvm/lib/Target/X86/X86ISelLowering.cpp
>     llvm/test/CodeGen/X86/avx.ll
>     llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
>     llvm/test/CodeGen/X86/pr15296.ll
>     llvm/test/CodeGen/X86/sse41.ll
>     llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
>
> Removed:
>
>
>
> ################################################################################
> diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
> index 144c81b3ebebf..b435f13632c79 100644
> --- a/llvm/lib/Target/X86/X86ISelLowering.cpp
> +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
> @@ -35797,6 +35797,19 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
>                       (RootVT.isFloatingPoint() && Depth >= 1) ||
>                       (RootVT.is256BitVector() && !Subtarget.hasAVX2());
>
> +  // How many elements does each of the inputs have, given the current
> +  // granularity of the root shuffle? Note that while currently the sizes of
> +  // the inputs must match the size of the shuffle root,
> +  // that restriction will be lifted in the future.
> +  SmallVector<unsigned, 2> InputNumElts;
> +  llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
> +                  std::back_inserter(InputNumElts),
> +                  [BaseMaskEltSizeInBits](MVT VT) {
> +                    assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
> +                           "Input is not a multiple of output element width?");
> +                    return VT.getSizeInBits() / BaseMaskEltSizeInBits;
> +                  });
> +
>    // Don't combine if we are an AVX512/EVEX target and the mask element size
>    // is different from the root element size - this would prevent writemasks
>    // from being reused.
>    // from being reused.
> @@ -35811,12 +35824,38 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
>    // If we are shuffling a broadcast (and not introducing zeros) then
>    // we can just use the broadcast directly. This works for smaller broadcast
>    // elements as well, since they already repeat across each mask element.
> -  if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
> -      (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
> +  SmallVector<bool, 2> InputIsSplat;
> +  llvm::transform(
> +      std::initializer_list<SDValue>({V1, V2}),
> +      std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
> +        return isTargetShuffleSplat(V) &&
> +               (BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
> +      });
> +  if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
>        V1.getValueSizeInBits() >= RootSizeInBits) {
>      return CanonicalizeShuffleInput(RootVT, V1);
>    }
>
> +  // Adjust mask elements that pick from a splat input to be identity mask elts,
> +  // i.e. to pick from the same lane of the input as the mask element is in.
> +  // This may allow simplifying the shuffle into a blend.
> +  SmallVector<int> NewMask;
> +  if (InputIsSplat[0] || InputIsSplat[1]) {
> +    NewMask.assign(BaseMask.begin(), BaseMask.end());
> +    for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
> +      int &M = NewMask[i];
> +      assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
> +             "OOB mask element?");
> +      if (M < 0)
> +        continue; // Keep the undef/zero mask elements as-is.
> +      int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
> +      // Is the used input wide enough to contain that lane, and is it a splat?
> +      if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
> +        M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
> +    }
> +    BaseMask = std::move(NewMask);
> +  }
> +
>    // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
>    // etc. can be simplified.
>    if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
>
> diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
> index a176edba13aa9..b542f173ae982 100644
> --- a/llvm/test/CodeGen/X86/avx.ll
> +++ b/llvm/test/CodeGen/X86/avx.ll
> @@ -153,11 +153,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
>  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
>  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
>  ; X32-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
> -; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
> -; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
> +; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
> +; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
>  ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
> -; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
> -; X32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
> +; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
> +; X32-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
>  ; X32-NEXT:    vaddps %xmm2, %xmm1, %xmm1
>  ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
>  ; X32-NEXT:    retl
> @@ -165,11 +165,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
>  ; X64-LABEL: insertps_from_broadcast_multiple_use:
>  ; X64:       ## %bb.0:
>  ; X64-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4
> -; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
> -; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
> +; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
> +; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
>  ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
> -; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
> -; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
> +; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
> +; X64-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
>  ; X64-NEXT:    vaddps %xmm2, %xmm1, %xmm1
>  ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
>  ; X64-NEXT:    retq
>
> diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
> index 71682094d64e9..a763f92b76409 100644
> --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
> +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
> @@ -4315,7 +4315,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double
>  ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
>  ; CHECK-FAST:       # %bb.0:
>  ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
> -; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,4,2,6]
> +; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,5,2,7]
>  ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
>  ; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
>  ; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
> @@ -4340,7 +4340,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x doub
>  ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
>  ; CHECK-FAST:       # %bb.0:
>  ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
> -; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,4,2,6]
> +; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,5,2,7]
>  ; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
>  ; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
>  ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
>
> diff --git a/llvm/test/CodeGen/X86/pr15296.ll b/llvm/test/CodeGen/X86/pr15296.ll
> index 71034f696429c..f957557130008 100644
> --- a/llvm/test/CodeGen/X86/pr15296.ll
> +++ b/llvm/test/CodeGen/X86/pr15296.ll
> @@ -26,28 +26,11 @@ allocas:
>  define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind {
>  ; CHECK-LABEL: shiftInput___canonical:
>  ; CHECK:       # %bb.0: # %allocas
> -; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
> -; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
> -; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
> -; CHECK-NEXT:    vpsrld %xmm2, %xmm3, %xmm4
> -; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm5
> -; CHECK-NEXT:    vpsrld %xmm5, %xmm3, %xmm6
> -; CHECK-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
> -; CHECK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
> -; CHECK-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3,4,5,6,7]
> -; CHECK-NEXT:    vpsrld %xmm6, %xmm3, %xmm7
> -; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
> -; CHECK-NEXT:    vpsrld %xmm1, %xmm3, %xmm3
> -; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
> -; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
> -; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
> -; CHECK-NEXT:    vpsrld %xmm5, %xmm0, %xmm4
> -; CHECK-NEXT:    vpsrld %xmm6, %xmm0, %xmm5
> -; CHECK-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
> -; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7]
> -; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
> -; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
> -; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
> +; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
> +; CHECK-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
> +; CHECK-NEXT:    vpsrld %xmm2, %xmm1, %xmm1
> +; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
> +; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
>  ; CHECK-NEXT:    retl
>  allocas:
>    %smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0
>
> diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
> index 17aae3373c5ec..1a1b976c45403 100644
> --- a/llvm/test/CodeGen/X86/sse41.ll
> +++ b/llvm/test/CodeGen/X86/sse41.ll
> @@ -1661,15 +1661,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
>  ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
>  ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
>  ; X86-AVX1-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
> -; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
> -; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
> -; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
> -; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
> +; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
> +; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
> +; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
> +; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
>  ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
> -; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
> -; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
> -; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
> -; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
> +; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
> +; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
> +; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
> +; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
>  ; X86-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
>  ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
>  ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
> @@ -1679,16 +1679,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
>  ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
>  ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
>  ; X86-AVX512-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
> -; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
> -; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
> -; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
> -; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
> +; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
> +; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
> +; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
> +; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
> +; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
> +; X86-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
> +; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
> +; X86-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
>  ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
> -; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
> -; X86-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
> -; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
> -; X86-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
> -; X86-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
> +; X86-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
>  ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
>  ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
>  ;
> @@ -1712,15 +1712,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
>  ; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
>  ; X64-AVX1:       ## %bb.0:
>  ; X64-AVX1-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
> -; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
> -; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
> -; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
> -; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
> +; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
> +; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
> +; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
> +; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
>  ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
> -; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
> -; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
> -; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
> -; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
> +; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
> +; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
> +; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
> +; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
>  ; X64-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
>  ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
>  ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
> @@ -1728,16 +1728,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
>  ; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
>  ; X64-AVX512:       ## %bb.0:
>  ; X64-AVX512-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
> -; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
> -; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
> -; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
> -; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
> +; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
> +; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
> +; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
> +; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
> +; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
> +; X64-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
> +; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
> +; X64-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
>  ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
> -; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
> -; X64-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
> -; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
> -; X64-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
> -; X64-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
> +; X64-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
>  ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
>  ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
>    %1 = getelementptr inbounds float, float* %fb, i64 %index
>
> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
> index d280580f55f1b..96bcaa1a0d443 100644
> --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
> +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
> @@ -4591,14 +4591,14 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
>  ; AVX2:       # %bb.0:
>  ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
>  ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
> -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> +; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
>  ; AVX2-NEXT:    retq
>  ;
>  ; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
>  ; AVX512VLBW:       # %bb.0:
>  ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
>  ; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
> -; AVX512VLBW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> +; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
>  ; AVX512VLBW-NEXT:    retq
>  ;
>  ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
>
>
>