[llvm] d2057a8 - [X86][AVX] Lower v16i8/v8i16 binary shuffles using VTRUNC/TRUNCATE
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 18 03:12:34 PDT 2020
Author: Simon Pilgrim
Date: 2020-08-18T11:11:58+01:00
New Revision: d2057a8015e97b5e212f3963923dbdcce9356a8f
URL: https://github.com/llvm/llvm-project/commit/d2057a8015e97b5e212f3963923dbdcce9356a8f
DIFF: https://github.com/llvm/llvm-project/commit/d2057a8015e97b5e212f3963923dbdcce9356a8f.diff
LOG: [X86][AVX] Lower v16i8/v8i16 binary shuffles using VTRUNC/TRUNCATE
This patch adds lowerShuffleAsVTRUNC to handle basic binary shuffles that can be lowered either as a pure ISD::TRUNCATE or as an X86ISD::VTRUNC (with undef/zero values in the remaining upper elements).
We concatenate the two binary sources into a single 256-bit source vector. To avoid regressions we only attempt this after trying to lower with PACKSS/PACKUS, which typically does a cleaner job than a concat.
For non-AVX512VL targets we have to canonicalize VTRUNC cases to use a 512-bit source vector (inserting undefs/zeros into the upper elements as necessary), truncate, and then (possibly) extract the 128-bit result.
This should address the last regressions in D66004.
Differential Revision: https://reviews.llvm.org/D86093
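For illustration (a hypothetical function, not taken from this patch's test files): a binary shuffle that selects the even i16 elements of both sources is equivalent to truncating the bitcast 256-bit concatenation <a, b> from v8i32 to v8i16, so it should now be matched by lowerShuffleAsVTRUNC and lowered through a plain ISD::TRUNCATE (compare the trunc2x4i32_8i16 checks updated below):

define <8 x i16> @shuffle_as_trunc_v8i16(<8 x i16> %a, <8 x i16> %b) {
  ; Mask <0,2,4,...,14> == truncate(bitcast(concat(a, b)) : v8i32) to v8i16.
  %s = shufflevector <8 x i16> %a, <8 x i16> %b,
         <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  ret <8 x i16> %s
}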
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
llvm/test/CodeGen/X86/vector-trunc.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9c084a4a2cd2..a929df328e13 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11325,17 +11325,15 @@ static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
-static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
- MVT VT, SDValue V1, SDValue V2,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+// TODO: Merge with lowerShuffleAsVTRUNC.
+static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
-
- if (Mask.size() != VT.getVectorNumElements())
- return SDValue();
-
bool SwappedOps = false;
+ // TODO: Convert to use Zeroable bitmask.
if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
if (!ISD::isBuildVectorAllZeros(V1.getNode()))
return SDValue();
@@ -11378,6 +11376,73 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
+// Attempt to match binary shuffle patterns as a truncate.
+static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / VT.getScalarSizeInBits();
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ // TODO: Support non-BWI VPMOVWB truncations?
+ unsigned SrcEltBits = EltSizeInBits * Scale;
+ if (SrcEltBits < 32 && !Subtarget.hasBWI())
+ continue;
+
+ // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
+ // Bail if the V2 elements are undef.
+ unsigned NumHalfSrcElts = NumElts / Scale;
+ unsigned NumSrcElts = 2 * NumHalfSrcElts;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
+ isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
+ continue;
+
+ // The elements beyond the truncation must be undef/zero.
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (UpperElts > 0 &&
+ !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
+
+ // As we're using both sources then we need to concat them together
+ // and truncate from the 256-bit src.
+ MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
+ SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
+
+ MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+ MVT SrcVT = MVT::getVectorVT(SrcSVT, 256 / SrcEltBits);
+ Src = DAG.getBitcast(SrcVT, Src);
+
+ if (SrcVT.getVectorNumElements() == NumElts)
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Src);
+
+ if (!Subtarget.hasVLX()) {
+ // Non-VLX targets must truncate from a 512-bit type, so we need to
+ // widen, truncate and then possibly extract the original 128-bit
+ // vector.
+ bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
+ Src = widenSubVector(Src, !UndefUppers, Subtarget, DAG, DL, 512);
+ unsigned NumWideSrcElts = Src.getValueType().getVectorNumElements();
+ if (NumWideSrcElts >= NumElts) {
+ // Widening means we can now use a regular TRUNCATE.
+ MVT WideVT = MVT::getVectorVT(VT.getScalarType(), NumWideSrcElts);
+ SDValue WideRes = DAG.getNode(ISD::TRUNCATE, DL, WideVT, Src);
+ if (!WideVT.is128BitVector())
+ WideRes = extract128BitVector(WideRes, 0, DAG, DL);
+ return WideRes;
+ }
+ }
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
+ }
+
+ return SDValue();
+}
+
/// Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
@@ -14733,7 +14798,7 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use lower using a truncation.
if (SDValue V =
- lowerShuffleWithVPMOV(DL, Mask, MVT::v8i16, V1, V2, DAG, Subtarget))
+ lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
return V;
int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
@@ -14816,6 +14881,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget))
return V;
+ // Try to use lower using a truncation.
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
Subtarget, DAG))
@@ -14922,7 +14992,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use lower using a truncation.
if (SDValue V =
- lowerShuffleWithVPMOV(DL, Mask, MVT::v16i8, V1, V2, DAG, Subtarget))
+ lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
+ if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
return V;
// See if we can use SSE4A Extraction / Insertion.
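As a further sketch of the non-VLX path described in the log message above (again a hypothetical function, not one of the tests below): selecting the even bytes of two v16i8 sources is a v16i16 to v16i8 truncate of the 256-bit concat, and on AVX512BW targets without VLX that concat is widened to 512 bits so a single vpmovwb %zmm, %ymm can perform the truncation, keeping only the low 128 bits of the result. This is the pattern the updated trunc2x8i16_16i8 and shuffle_v32i8_to_v16i8 checks below expect.

define <16 x i8> @shuffle_as_trunc_v16i8(<16 x i8> %a, <16 x i8> %b) {
  ; Even bytes of <a, b>: truncate(bitcast(concat(a, b)) : v16i16) to v16i8.
  %s = shufflevector <16 x i8> %a, <16 x i8> %b,
         <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
                     i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  ret <16 x i8> %s
}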
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 9256a43f8e33..1559fdbbe72c 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -42,11 +42,10 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand 16(%rdi), %xmm0, %xmm1
-; AVX512BW-NEXT: vpand (%rdi), %xmm0, %xmm0
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
@@ -143,11 +142,10 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
@@ -159,11 +157,10 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
-; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
@@ -377,54 +374,42 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vpmovdb %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vpmovdb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VBMIVL-NEXT: vpmovdb %xmm1, %xmm1
-; AVX512VBMIVL-NEXT: vpmovdb %xmm0, %xmm0
-; AVX512VBMIVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -1081,49 +1066,42 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u>
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u>
-; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
+; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -1199,54 +1177,42 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vpmovqb %xmm1, %xmm1
-; AVX512VL-NEXT: vpmovqb %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vpmovqb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovqb %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VBMIVL-NEXT: vpmovqb %xmm1, %xmm1
-; AVX512VBMIVL-NEXT: vpmovqb %xmm0, %xmm0
-; AVX512VBMIVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VBMIVL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 811fbe37497c..6c5f5125109d 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -178,20 +178,17 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
@@ -211,20 +208,17 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
@@ -244,20 +238,17 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VBMI-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
@@ -293,44 +284,43 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
+; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
@@ -346,13 +336,14 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512VBMI-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VBMI-NEXT: vpmovqw %zmm1, %xmm1
+; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
@@ -386,20 +377,17 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
@@ -415,20 +403,17 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
@@ -444,20 +429,17 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VBMI-NEXT: vpmovqb %zmm1, %xmm1
+; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index f0398b15d04a..d52e6195f8df 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1581,10 +1581,11 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
;
; AVX512F-LABEL: trunc2x4i32_8i16:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i32_8i16:
@@ -1597,10 +1598,11 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
;
; AVX512BW-LABEL: trunc2x4i32_8i16:
; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i32_8i16:
@@ -1709,10 +1711,11 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
;
; AVX512BW-LABEL: trunc2x8i16_16i8:
; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x8i16_16i8:
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index b80775ac7d57..047978c8a0da 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -383,33 +383,88 @@ ret void
}
define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
-; AVX-LABEL: interleaved_load_vf8_i8_stride4:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vmovdqa (%rdi), %xmm1
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; AVX1-LABEL: interleaved_load_vf8_i8_stride4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vmovdqa (%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: interleaved_load_vf8_i8_stride4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vmovdqa (%rdi), %xmm1
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX2-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: interleaved_load_vf8_i8_stride4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm4
+; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm4
+; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-NEXT: vpaddb %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
%v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
@@ -529,10 +584,8 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm6
-; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX512-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512-NEXT: vpmovdb %zmm5, %xmm5
; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6
@@ -762,85 +815,83 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
;
; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm11
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14
; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm3
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm13
-; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0
+; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm3
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2
+; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm14
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm5, %xmm14, %xmm6
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512-NEXT: vpmovdb %zmm5, %xmm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm5, %xmm11, %xmm0
-; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm6
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
+; AVX512-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3
+; AVX512-NEXT: vpmovdb %zmm3, %xmm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm0
+; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm5
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm6
-; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm7
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vmovdqa (%rdi), %xmm10
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5
+; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm6
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT: vmovdqa (%rdi), %xmm12
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm13
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm7
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm6
-; AVX512-NEXT: vpshufb %xmm5, %xmm7, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
-; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm6
-; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm5
+; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
+; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm5
+; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm5
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm5, %xmm11, %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm5
-; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm6
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm4
+; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm5
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5
+; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm6
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm2
-; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3
-; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm3
-; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm5, %xmm14, %xmm6
-; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm2
-; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vpcmpeqb %zmm8, %zmm9, %k0
; AVX512-NEXT: vpcmpeqb %zmm0, %zmm1, %k1
; AVX512-NEXT: kxnord %k1, %k0, %k0