[llvm-branch-commits] [llvm] d5c434d - [X86][SSE] combineX86ShufflesRecursively - add basic handling for combining shuffles of different widths (PR45974)
Simon Pilgrim via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sun Dec 13 09:23:40 PST 2020
Author: Simon Pilgrim
Date: 2020-12-13T17:18:07Z
New Revision: d5c434d7dda25909cd7886e419baf3db3578953e
URL: https://github.com/llvm/llvm-project/commit/d5c434d7dda25909cd7886e419baf3db3578953e
DIFF: https://github.com/llvm/llvm-project/commit/d5c434d7dda25909cd7886e419baf3db3578953e.diff
LOG: [X86][SSE] combineX86ShufflesRecursively - add basic handling for combining shuffles of different widths (PR45974)
If a faux shuffle uses smaller shuffle inputs, try to recursively combine with those inputs directly instead of widening them immediately; any remaining smaller inputs are then widened at the bottom of the recursion.
This still means we generate nodes on the fly (PR45974) even when we don't combine to a new shuffle, but it does help AVX2+ targets combine across xmm/ymm/zmm types, mainly as variable shuffles.
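As a sketch of the new recursion order, consider this hedged toy model (the Op struct and collectInputs are hypothetical stand-ins, not the SelectionDAG API): a narrower faux-shuffle input is now recursed into at its own width, and widening to the root width is deferred until the recursion bottoms out and all combinable inputs have been collected.

#include <cstdio>
#include <vector>

struct Op {
  unsigned SizeInBits;
  std::vector<Op *> ShuffleInputs; // empty => a leaf, not a (faux) shuffle
};

// Collect leaf inputs, recursing into nested shuffles at their own width
// instead of widening them to the root width up front.
static void collectInputs(Op &O, std::vector<Op *> &Ops) {
  for (Op *In : O.ShuffleInputs) {
    if (!In->ShuffleInputs.empty())
      collectInputs(*In, Ops);
    else
      Ops.push_back(In);
  }
}

int main() {
  Op A{128, {}}, B{128, {}};
  Op Narrow{128, {&A, &B}}; // an xmm faux shuffle feeding...
  Op Root{256, {&Narrow}};  // ...a ymm root shuffle
  std::vector<Op *> Ops;
  collectInputs(Root, Ops);
  // Bottom of the recursion: widen whatever is still narrower than the root.
  for (Op *In : Ops)
    if (In->SizeInBits < Root.SizeInBits)
      std::printf("widen input from %u to %u bits\n", In->SizeInBits,
                  Root.SizeInBits);
  return 0;
}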
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/min-legal-vector-width.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-v1.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 04987a2b9abe..b4a397080284 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36135,8 +36135,8 @@ static SDValue combineX86ShufflesRecursively(
if (!VT.isVector() || !VT.isSimple())
return SDValue(); // Bail if we hit a non-simple non-vector.
- assert(VT.getSizeInBits() == RootSizeInBits &&
- "Can only combine shuffles of the same vector register size.");
+ assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
+ "Can only combine shuffles upto size of the root op.");
// Extract target shuffle mask and resolve sentinels and inputs.
// TODO - determine Op's demanded elts from RootMask.
@@ -36149,17 +36149,32 @@ static SDValue combineX86ShufflesRecursively(
OpZero, DAG, Depth, false))
return SDValue();
- // Shuffle inputs must be the same size as the result, bail on any larger
- // inputs and widen any smaller inputs.
- if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {
- return Op.getValueSizeInBits() > RootSizeInBits;
+ // Shuffle inputs must not be larger than the shuffle result.
+ // TODO: Relax this for single input faux shuffles (trunc/extract_subvector).
+ if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
+ return OpInput.getValueSizeInBits() > VT.getSizeInBits();
}))
return SDValue();
- for (SDValue &Op : OpInputs)
- if (Op.getValueSizeInBits() < RootSizeInBits)
- Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,
- SDLoc(Op), RootSizeInBits);
+ // If the shuffle result was smaller than the root, we need to adjust the
+ // mask indices and pad the mask with undefs.
+ if (RootSizeInBits > VT.getSizeInBits()) {
+ unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
+ unsigned OpMaskSize = OpMask.size();
+ if (OpInputs.size() > 1) {
+ unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
+ for (int &M : OpMask) {
+ if (M < 0)
+ continue;
+ int EltIdx = M % OpMaskSize;
+ int OpIdx = M / OpMaskSize;
+ M = (PaddedMaskSize * OpIdx) + EltIdx;
+ }
+ }
+ OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
+ OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
+ OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
+ }
SmallVector<int, 64> Mask;
SmallVector<SDValue, 16> Ops;
@@ -36337,6 +36352,18 @@ static SDValue combineX86ShufflesRecursively(
}
}
+ // Widen any subvector shuffle inputs we've collected.
+ if (any_of(Ops, [RootSizeInBits](SDValue Op) {
+ return Op.getValueSizeInBits() < RootSizeInBits;
+ })) {
+ for (SDValue &Op : Ops)
+ if (Op.getValueSizeInBits() < RootSizeInBits)
+ Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
+ RootSizeInBits);
+ // Reresolve - we might have repeated subvector sources.
+ resolveTargetShuffleInputsAndMask(Ops, Mask);
+ }
+
// Attempt to constant fold all of the constant source ops.
if (SDValue Cst = combineX86ShufflesConstants(
Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
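To make the index arithmetic in the hunks above concrete, here is a hedged standalone sketch of the mask remap; padMaskToRoot is an illustrative helper (not an LLVM function) that reuses the patch's EltIdx/OpIdx/PaddedMaskSize naming and only models the mask (the patch also zero-extends the OpZero/OpUndef bit masks).

#include <cassert>
#include <cstdio>
#include <vector>

// Remap a multi-input shuffle mask whose op is narrower than the root, then
// pad the tail with undef (-1) sentinels, mirroring the first hunk above.
static std::vector<int> padMaskToRoot(std::vector<int> Mask,
                                      unsigned RootSizeInBits,
                                      unsigned OpSizeInBits,
                                      unsigned NumInputs) {
  assert(RootSizeInBits % OpSizeInBits == 0 &&
         "Root size must be a multiple of the op size");
  int NumSubVecs = RootSizeInBits / OpSizeInBits;
  int OpMaskSize = (int)Mask.size();
  if (NumInputs > 1) {
    // Once each input is widened to the root size, element EltIdx of input
    // OpIdx moves from index OpIdx * OpMaskSize to OpIdx * PaddedMaskSize.
    int PaddedMaskSize = NumSubVecs * OpMaskSize;
    for (int &M : Mask) {
      if (M < 0)
        continue; // keep undef/zero sentinels untouched
      int EltIdx = M % OpMaskSize;
      int OpIdx = M / OpMaskSize;
      M = PaddedMaskSize * OpIdx + EltIdx;
    }
  }
  // The upper (NumSubVecs - 1) * OpMaskSize lanes of the result are undef.
  Mask.resize(NumSubVecs * OpMaskSize, -1);
  return Mask;
}

int main() {
  // Two v4 inputs under a 2x wider root: mask index 5 is element 1 of
  // input 1, which becomes 8 * 1 + 1 = 9 in the padded index space.
  for (int M : padMaskToRoot({0, 5, 2, 7}, 256, 128, 2))
    std::printf("%d ", M); // prints: 0 9 2 11 -1 -1 -1 -1
  std::printf("\n");
  return 0;
}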
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index a39fbf878fd9..5456cd2e753a 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1682,25 +1682,45 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-leg
}
define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-legal-vector-width"="256" {
-; CHECK-LABEL: splatvar_rotate_v32i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastb %xmm1, %xmm1
-; CHECK-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpsllw %xmm2, %ymm0, %ymm3
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; CHECK-NEXT: vpsubb %xmm1, %xmm4, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vpsllw %xmm2, %xmm4, %xmm2
-; CHECK-NEXT: vpbroadcastb %xmm2, %ymm2
-; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm5
-; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm2
-; CHECK-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
-; CHECK-NEXT: vpsrlw $8, %xmm0, %xmm0
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
-; CHECK-NEXT: vpternlogq $236, %ymm5, %ymm2, %ymm0
-; CHECK-NEXT: retq
+; CHECK-AVX512-LABEL: splatvar_rotate_v32i8:
+; CHECK-AVX512: # %bb.0:
+; CHECK-AVX512-NEXT: vpbroadcastb %xmm1, %xmm1
+; CHECK-AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-AVX512-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm3
+; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; CHECK-AVX512-NEXT: vpsubb %xmm1, %xmm4, %xmm1
+; CHECK-AVX512-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; CHECK-AVX512-NEXT: vpsllw %xmm2, %xmm4, %xmm2
+; CHECK-AVX512-NEXT: vpbroadcastb %xmm2, %ymm2
+; CHECK-AVX512-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm5
+; CHECK-AVX512-NEXT: vpand %ymm2, %ymm3, %ymm2
+; CHECK-AVX512-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; CHECK-AVX512-NEXT: vpsrlw $8, %xmm0, %xmm0
+; CHECK-AVX512-NEXT: vpbroadcastb %xmm0, %ymm0
+; CHECK-AVX512-NEXT: vpternlogq $236, %ymm5, %ymm2, %ymm0
+; CHECK-AVX512-NEXT: retq
+;
+; CHECK-VBMI-LABEL: splatvar_rotate_v32i8:
+; CHECK-VBMI: # %bb.0:
+; CHECK-VBMI-NEXT: vpbroadcastb %xmm1, %xmm1
+; CHECK-VBMI-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; CHECK-VBMI-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-VBMI-NEXT: vpsllw %xmm2, %ymm0, %ymm3
+; CHECK-VBMI-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; CHECK-VBMI-NEXT: vpsllw %xmm2, %xmm4, %xmm2
+; CHECK-VBMI-NEXT: vpbroadcastb %xmm2, %ymm2
+; CHECK-VBMI-NEXT: vpand %ymm2, %ymm3, %ymm2
+; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; CHECK-VBMI-NEXT: vpsubb %xmm1, %xmm3, %xmm1
+; CHECK-VBMI-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-VBMI-NEXT: vpsrlw %xmm1, %ymm0, %ymm3
+; CHECK-VBMI-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-VBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; CHECK-VBMI-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0
+; CHECK-VBMI-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
%shl = shl <32 x i8> %a, %splat
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 5eb4b1039bf9..19ea28085d75 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -59,18 +59,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00:
; XOPAVX1: # %bb.0:
@@ -115,18 +108,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00:
; XOPAVX1: # %bb.0:
@@ -171,18 +157,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00:
; XOPAVX1: # %bb.0:
@@ -218,17 +197,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; XOPAVX1: # %bb.0:
@@ -262,17 +235,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; XOPAVX1: # %bb.0:
@@ -306,17 +273,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; XOPAVX1: # %bb.0:
@@ -350,17 +311,11 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX2-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; XOPAVX1: # %bb.0:
@@ -1040,19 +995,12 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_0
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00:
; XOPAVX1: # %bb.0:
@@ -1094,19 +1042,12 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_0
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,0,2,0,0,0,0,0,0,0,2,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00:
; XOPAVX1: # %bb.0:
@@ -1148,19 +1089,12 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_0
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00:
; XOPAVX1: # %bb.0:
@@ -1192,18 +1126,12 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0,0,0,0,4,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
; XOPAVX1: # %bb.0:
@@ -1233,18 +1161,12 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
; XOPAVX1: # %bb.0:
@@ -1274,18 +1196,12 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0,0,6,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; XOPAVX1: # %bb.0:
@@ -1315,18 +1231,12 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
-; AVX512VL-FAST-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512VL-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; XOPAVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 4c8c8f6312f1..7bbff8767922 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -272,11 +272,23 @@ define <8 x float> @shuffle_v8f32_08080808(<8 x float> %a, <8 x float> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8f32_08080808:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2OR512VL-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8f32_08080808:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_08080808:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8f32_08080808:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [34359738368,34359738368,34359738368,34359738368]
+; AVX512VL-FAST-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x float> %shuffle
}
@@ -1617,11 +1629,23 @@ define <8 x i32> @shuffle_v8i32_08080808(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_08080808:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2OR512VL-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i32_08080808:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_08080808:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-LABEL: shuffle_v8i32_08080808:
+; AVX512VL-FAST: # %bb.0:
+; AVX512VL-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [34359738368,34359738368,34359738368,34359738368]
+; AVX512VL-FAST-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x i32> %shuffle
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
index 6d8306bf01bb..015f2e9bfe24 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -500,8 +500,8 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512VL-NEXT: kmovw %edi, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2]
+; AVX512VL-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX512VL-NEXT: vpslld $31, %ymm1, %ymm1
; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
@@ -513,8 +513,8 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; VL_BW_DQ: # %bb.0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0
-; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %ymm0
+; VL_BW_DQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
+; VL_BW_DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index a540d04626ae..42808b3910e4 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -810,85 +810,79 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
;
; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm3
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
-; AVX512-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3
-; AVX512-NEXT: vpmovdb %zmm3, %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm0
-; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm12
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm0
+; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5
-; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm6
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vmovdqa (%rdi), %xmm12
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm13
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm4
+; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm6
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT: vmovdqa (%rdi), %xmm13
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm6
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm7
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm5
-; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm5
-; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4
-; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm5
+; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm2
+; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm5
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm5, %xmm11, %xmm6
-; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm4
-; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm4
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm4
-; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4
-; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5
-; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm6
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm2
; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vpcmpeqb %zmm8, %zmm9, %k0
-; AVX512-NEXT: vpcmpeqb %zmm0, %zmm1, %k1
+; AVX512-NEXT: vpcmpeqb %zmm0, %zmm15, %k1
; AVX512-NEXT: kxnord %k1, %k0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0