[llvm] f819e4c - [X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 5 10:37:15 PDT 2021


Thanks for the testcase!

On Thu, Aug 5, 2021 at 8:04 PM Benjamin Kramer <benny.kra at gmail.com> wrote:
>
> This causes infinite loops during compilation; reverted in
> https://github.com/llvm/llvm-project/commit/bd17ced1db9a674fc8aa6632899e245672c7aa35
>
> Test case:
>
> $ cat t.ll
> target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-unknown-linux-gnu"
>
> define void @MaxPoolGradGrad_1.65() local_unnamed_addr #0 {
> entry:
>   %wide.vec78 = load <64 x i32>, <64 x i32>* null, align 16
>   %strided.vec83 = shufflevector <64 x i32> %wide.vec78, <64 x i32> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
>   %0 = lshr <8 x i32> %strided.vec83, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
>   %1 = add <8 x i32> zeroinitializer, %0
>   %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
>   %3 = shufflevector <16 x i32> %2, <16 x i32> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
>   %interleaved.vec = shufflevector <32 x i32> undef, <32 x i32> %3, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
>   store <64 x i32> %interleaved.vec, <64 x i32>* undef, align 16
>   unreachable
> }
>
> $ llc < t.ll -mcpu=skylake
> <hang>
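>
> For intuition, a hang like this usually means two rewrites keep undoing
> each other in a run-to-fixed-point loop. A deliberately artificial sketch
> of that failure mode (not the actual DAG combiner code):
>
>   // Toy fixed-point rewriter: combineA prefers form 1, combineB prefers
>   // form 0, so the loop never reaches a fixed point and spins forever.
>   struct Node { int Form = 0; };
>
>   static bool combineA(Node &N) { return N.Form == 0 ? (N.Form = 1, true) : false; }
>   static bool combineB(Node &N) { return N.Form == 1 ? (N.Form = 0, true) : false; }
>
>   int main() {
>     Node N;
>     bool Changed = true;
>     while (Changed) { // every iteration reports a "change", so this never exits
>       Changed = false;
>       Changed |= combineA(N);
>       Changed |= combineB(N);
>     }
>   }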
>
> On Wed, Aug 4, 2021 at 3:55 PM Roman Lebedev via llvm-commits
> <llvm-commits at lists.llvm.org> wrote:
> >
> >
> > Author: Roman Lebedev
> > Date: 2021-08-04T16:55:04+03:00
> > New Revision: f819e4c7d0f6efef3cc1042cc45582320bf6c0a2
> >
> > URL: https://github.com/llvm/llvm-project/commit/f819e4c7d0f6efef3cc1042cc45582320bf6c0a2
> > DIFF: https://github.com/llvm/llvm-project/commit/f819e4c7d0f6efef3cc1042cc45582320bf6c0a2.diff
> >
> > LOG: [X86] combineX86ShuffleChain(): canonicalize mask elts picking from splats
> >
> > Given a shuffle mask, if an element picks from an input that is a splat
> > at the current granularity of the shuffle, then adjust that element to
> > pick from the lane of the input matching the element's own position.
> > This may allow the shuffle to be simplified into a blend.
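> >
> > To make that concrete, here is a rough standalone sketch of the rewrite
> > (illustrative only, not the actual LLVM code; all names are made up):
> >
> >   #include <cstdio>
> >   #include <vector>
> >
> >   // Mask elements in [0, NumElts) pick from input 0, elements in
> >   // [NumElts, 2*NumElts) pick from input 1; negative values are undef/zero.
> >   // If the picked input is a splat (and wide enough), all of its lanes hold
> >   // the same value, so let the element read its own lane instead.
> >   static void canonicalizeSplatPicks(std::vector<int> &Mask,
> >                                      const bool IsSplat[2],
> >                                      const unsigned NumInputElts[2]) {
> >     unsigned NumElts = (unsigned)Mask.size();
> >     for (unsigned I = 0; I != NumElts; ++I) {
> >       int &M = Mask[I];
> >       if (M < 0)
> >         continue; // keep undef/zero elements as-is
> >       unsigned InputIdx = (unsigned)M < NumElts ? 0 : 1;
> >       if (IsSplat[InputIdx] && I < NumInputElts[InputIdx])
> >         M = I + InputIdx * NumElts; // pick the same lane of that input
> >     }
> >   }
> >
> >   int main() {
> >     // An "insert" pattern: lanes 0..2 from input 0, lane 3 takes element 0
> >     // of input 1, and input 1 happens to be a splat.
> >     std::vector<int> Mask = {0, 1, 2, 4};
> >     const bool IsSplat[2] = {false, true};
> >     const unsigned NumInputElts[2] = {4, 4};
> >     canonicalizeSplatPicks(Mask, IsSplat, NumInputElts);
> >     for (int M : Mask)
> >       std::printf("%d ", M); // prints "0 1 2 7"
> >     std::printf("\n");
> >   }
> >
> > That {0,1,2,4} -> {0,1,2,7} rewrite is an identity-per-lane mask, i.e. a
> > blend, which is essentially the vinsertps -> vblendps change visible in
> > the avx.ll and sse41.ll diffs below.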
> >
> > I believe this is correct, given that the splat detection matches the one
> > just above the new code.
> >
> > My basic thought is that we might see fewer regressions from handling
> > multiple insertions of the same value into a vector if we form
> > broadcasts+blend here, as opposed to D105390, but I have not fully
> > thought this through and have not tried implementing it yet.
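> >
> > Roughly, the two shapes at the source level, as a hand-written intrinsics
> > sketch (purely illustrative, unrelated to D105390 itself; needs SSE4.1/AVX):
> >
> >   #include <immintrin.h>
> >
> >   // Repeated-insert shape: every use re-loads and re-inserts the scalar.
> >   static __m128 insert_lane3_via_insertps(__m128 v, const float *p) {
> >     // imm 0x30: take lane 0 of the source, write it into lane 3 of v.
> >     return _mm_insert_ps(v, _mm_load_ss(p), 0x30);
> >   }
> >
> >   // Broadcast+blend shape: splat the scalar once, then each use is a blend.
> >   static __m128 insert_lane3_via_blend(__m128 v, __m128 splat) {
> >     // imm 0x8: take lane 3 from the splat, lanes 0..2 from v.
> >     return _mm_blend_ps(v, splat, 0x8);
> >   }
> >
> >   int main() {
> >     float f = 42.0f;
> >     __m128 a = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
> >     __m128 b = _mm_set_ps(7.0f, 6.0f, 5.0f, 4.0f);
> >
> >     __m128 splat = _mm_broadcast_ss(&f); // one broadcast, reused below
> >     __m128 r0 = insert_lane3_via_blend(a, splat);
> >     __m128 r1 = insert_lane3_via_blend(b, splat);
> >
> >     // Same results via repeated inserts of the same scalar.
> >     __m128 s0 = insert_lane3_via_insertps(a, &f);
> >     __m128 s1 = insert_lane3_via_insertps(b, &f);
> >     (void)r0; (void)r1; (void)s0; (void)s1;
> >   }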
> >
> > Reviewed By: RKSimon
> >
> > Differential Revision: https://reviews.llvm.org/D107009
> >
> > Added:
> >
> >
> > Modified:
> >     llvm/lib/Target/X86/X86ISelLowering.cpp
> >     llvm/test/CodeGen/X86/avx.ll
> >     llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
> >     llvm/test/CodeGen/X86/pr15296.ll
> >     llvm/test/CodeGen/X86/sse41.ll
> >     llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
> >
> > Removed:
> >
> >
> >
> > ################################################################################
> > diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
> > index 144c81b3ebebf..b435f13632c79 100644
> > --- a/llvm/lib/Target/X86/X86ISelLowering.cpp
> > +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
> > @@ -35797,6 +35797,19 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
> >                       (RootVT.isFloatingPoint() && Depth >= 1) ||
> >                       (RootVT.is256BitVector() && !Subtarget.hasAVX2());
> >
> > +  // How many elements does each of the inputs have, given the current
> > +  // granularity of the root shuffle? Note that while currently the sizes of an
> > +  // inputs must match the size of the shuffle root,
> > +  // that restriction will be lifted in the future.
> > +  SmallVector<unsigned, 2> InputNumElts;
> > +  llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
> > +                  std::back_inserter(InputNumElts),
> > +                  [BaseMaskEltSizeInBits](MVT VT) {
> > +                    assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
> > +                           "Input is not a multiple of output element width?");
> > +                    return VT.getSizeInBits() / BaseMaskEltSizeInBits;
> > +                  });
> > +
> >    // Don't combine if we are a AVX512/EVEX target and the mask element size
> >    // is different from the root element size - this would prevent writemasks
> >    // from being reused.
> > @@ -35811,12 +35824,38 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
> >    // If we are shuffling a broadcast (and not introducing zeros) then
> >    // we can just use the broadcast directly. This works for smaller broadcast
> >    // elements as well as they already repeat across each mask element
> > -  if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
> > -      (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
> > +  SmallVector<bool, 2> InputIsSplat;
> > +  llvm::transform(
> > +      std::initializer_list<SDValue>({V1, V2}),
> > +      std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
> > +        return isTargetShuffleSplat(V) &&
> > +               (BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
> > +      });
> > +  if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
> >        V1.getValueSizeInBits() >= RootSizeInBits) {
> >      return CanonicalizeShuffleInput(RootVT, V1);
> >    }
> >
> > +  // Adjust mask elements that pick from a splat input to be identity mask elts,
> > +  // i.e. to pick from the same lane of the input as the mask element is in.
> > +  // This may allow to simplify the shuffle into a blend.
> > +  SmallVector<int> NewMask;
> > +  if (InputIsSplat[0] || InputIsSplat[1]) {
> > +    NewMask.assign(BaseMask.begin(), BaseMask.end());
> > +    for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
> > +      int &M = NewMask[i];
> > +      assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
> > +             "OOB mask element?");
> > +      if (M < 0)
> > +        continue; // Keep the undef/zero mask elements as-is.
> > +      int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
> > +      // Is the used input wide-enough to contain that lane, and is it a splat?
> > +      if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
> > +        M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
> > +    }
> > +    BaseMask = std::move(NewMask);
> > +  }
> > +
> >    // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
> >    // etc. can be simplified.
> >    if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
> >
> > diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
> > index a176edba13aa9..b542f173ae982 100644
> > --- a/llvm/test/CodeGen/X86/avx.ll
> > +++ b/llvm/test/CodeGen/X86/avx.ll
> > @@ -153,11 +153,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
> >  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
> >  ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
> >  ; X32-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
> > -; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
> > -; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
> > +; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
> > +; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
> >  ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
> > -; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
> > -; X32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
> > +; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
> > +; X32-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
> >  ; X32-NEXT:    vaddps %xmm2, %xmm1, %xmm1
> >  ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
> >  ; X32-NEXT:    retl
> > @@ -165,11 +165,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
> >  ; X64-LABEL: insertps_from_broadcast_multiple_use:
> >  ; X64:       ## %bb.0:
> >  ; X64-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4
> > -; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
> > -; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
> > +; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
> > +; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
> >  ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
> > -; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
> > -; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
> > +; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
> > +; X64-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
> >  ; X64-NEXT:    vaddps %xmm2, %xmm1, %xmm1
> >  ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
> >  ; X64-NEXT:    retq
> >
> > diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
> > index 71682094d64e9..a763f92b76409 100644
> > --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
> > +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
> > @@ -4315,7 +4315,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double
> >  ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
> >  ; CHECK-FAST:       # %bb.0:
> >  ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
> > -; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,4,2,6]
> > +; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,5,2,7]
> >  ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
> >  ; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
> >  ; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
> > @@ -4340,7 +4340,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x doub
> >  ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
> >  ; CHECK-FAST:       # %bb.0:
> >  ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
> > -; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,4,2,6]
> > +; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,5,2,7]
> >  ; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
> >  ; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
> >  ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
> >
> > diff --git a/llvm/test/CodeGen/X86/pr15296.ll b/llvm/test/CodeGen/X86/pr15296.ll
> > index 71034f696429c..f957557130008 100644
> > --- a/llvm/test/CodeGen/X86/pr15296.ll
> > +++ b/llvm/test/CodeGen/X86/pr15296.ll
> > @@ -26,28 +26,11 @@ allocas:
> >  define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind {
> >  ; CHECK-LABEL: shiftInput___canonical:
> >  ; CHECK:       # %bb.0: # %allocas
> > -; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
> > -; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
> > -; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
> > -; CHECK-NEXT:    vpsrld %xmm2, %xmm3, %xmm4
> > -; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm5
> > -; CHECK-NEXT:    vpsrld %xmm5, %xmm3, %xmm6
> > -; CHECK-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
> > -; CHECK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
> > -; CHECK-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3,4,5,6,7]
> > -; CHECK-NEXT:    vpsrld %xmm6, %xmm3, %xmm7
> > -; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
> > -; CHECK-NEXT:    vpsrld %xmm1, %xmm3, %xmm3
> > -; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
> > -; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
> > -; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
> > -; CHECK-NEXT:    vpsrld %xmm5, %xmm0, %xmm4
> > -; CHECK-NEXT:    vpsrld %xmm6, %xmm0, %xmm5
> > -; CHECK-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
> > -; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7]
> > -; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
> > -; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
> > -; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
> > +; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
> > +; CHECK-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
> > +; CHECK-NEXT:    vpsrld %xmm2, %xmm1, %xmm1
> > +; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
> > +; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
> >  ; CHECK-NEXT:    retl
> >  allocas:
> >    %smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0
> >
> > diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
> > index 17aae3373c5ec..1a1b976c45403 100644
> > --- a/llvm/test/CodeGen/X86/sse41.ll
> > +++ b/llvm/test/CodeGen/X86/sse41.ll
> > @@ -1661,15 +1661,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
> >  ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
> >  ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
> >  ; X86-AVX1-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
> > -; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
> > -; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
> > -; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
> > -; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
> > +; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
> > +; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
> > +; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
> > +; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
> >  ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
> > -; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
> > -; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
> > -; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
> > -; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
> > +; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
> > +; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
> > +; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
> > +; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
> >  ; X86-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
> >  ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
> >  ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
> > @@ -1679,16 +1679,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
> >  ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
> >  ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
> >  ; X86-AVX512-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
> > -; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
> > -; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
> > -; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
> > -; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
> > +; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
> > +; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
> > +; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
> > +; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
> > +; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
> > +; X86-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
> > +; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
> > +; X86-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
> >  ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
> > -; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
> > -; X86-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
> > -; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
> > -; X86-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
> > -; X86-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
> > +; X86-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
> >  ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
> >  ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
> >  ;
> > @@ -1712,15 +1712,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
> >  ; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
> >  ; X64-AVX1:       ## %bb.0:
> >  ; X64-AVX1-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
> > -; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
> > -; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
> > -; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
> > -; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
> > +; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
> > +; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
> > +; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
> > +; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
> >  ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
> > -; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
> > -; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
> > -; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
> > -; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
> > +; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
> > +; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
> > +; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
> > +; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
> >  ; X64-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
> >  ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
> >  ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
> > @@ -1728,16 +1728,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
> >  ; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
> >  ; X64-AVX512:       ## %bb.0:
> >  ; X64-AVX512-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
> > -; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
> > -; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
> > -; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
> > -; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
> > +; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
> > +; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
> > +; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
> > +; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
> > +; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
> > +; X64-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
> > +; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
> > +; X64-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
> >  ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
> > -; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
> > -; X64-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
> > -; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
> > -; X64-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
> > -; X64-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
> > +; X64-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
> >  ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
> >  ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
> >    %1 = getelementptr inbounds float, float* %fb, i64 %index
> >
> > diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
> > index d280580f55f1b..96bcaa1a0d443 100644
> > --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
> > +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
> > @@ -4591,14 +4591,14 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
> >  ; AVX2:       # %bb.0:
> >  ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
> >  ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
> > -; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> > +; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
> >  ; AVX2-NEXT:    retq
> >  ;
> >  ; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
> >  ; AVX512VLBW:       # %bb.0:
> >  ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
> >  ; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
> > -; AVX512VLBW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> > +; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
> >  ; AVX512VLBW-NEXT:    retq
> >  ;
> >  ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
> >
> >
> >
> > _______________________________________________
> > llvm-commits mailing list
> > llvm-commits at lists.llvm.org
> > https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits


More information about the llvm-commits mailing list