[llvm] c6e5531 - [X86][AVX] Combine shuffles to TRUNCATE/VTRUNC patterns
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 25 10:42:11 PDT 2020
Author: Simon Pilgrim
Date: 2020-03-25T17:41:51Z
New Revision: c6e5531f9b56dcc205a0cad44099e1bf8f31b27c
URL: https://github.com/llvm/llvm-project/commit/c6e5531f9b56dcc205a0cad44099e1bf8f31b27c
DIFF: https://github.com/llvm/llvm-project/commit/c6e5531f9b56dcc205a0cad44099e1bf8f31b27c.diff
LOG: [X86][AVX] Combine shuffles to TRUNCATE/VTRUNC patterns
Add support for combining shuffles to AVX512 truncation instructions - another step toward fixing D56387/D66004. This also fixes the SKX codegen on PR31443.
We could probably extend this further to handle non-VLX truncation cases.
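As a rough illustration (taken from the shuffle-vs-trunc-128.ll and avx512-trunc.ll tests updated below), the combine recognises strided shuffles that keep every Nth element in the low lanes, with the remaining lanes zeroable or undef, e.g.:

  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef,
                   <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>

On AVX512BW+VLX targets this now lowers to a single vpmovwb truncation instead of a vpshufb byte shuffle, and the sub-128-bit cases such as <2 x i64> -> <2 x i8> and <4 x i32> -> <4 x i8> become vpmovqb/vpmovdb via the X86ISD::VTRUNC path.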
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-trunc.ll
llvm/test/CodeGen/X86/avx512-vec-cmp.ll
llvm/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
llvm/test/CodeGen/X86/vec_fp_to_int.ll
llvm/test/CodeGen/X86/vector-reduce-mul.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
llvm/test/CodeGen/X86/vector-trunc.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1a9e5b4c5d35..bff895d85afe 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7498,6 +7498,20 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
createPackShuffleMask(VT, Mask, IsUnary);
return true;
}
+ case ISD::TRUNCATE:
+ case X86ISD::VTRUNC: {
+ SDValue Src = N.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
+ unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
+ assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
+ for (unsigned i = 0; i != NumSrcElts; ++i)
+ Mask.push_back(i * Scale);
+ Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
+ Ops.push_back(Src);
+ return true;
+ }
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
@@ -11062,6 +11076,45 @@ static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
}
+// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
+// source into the lower elements and zeroing the upper elements.
+// TODO: Merge with matchShuffleAsVPMOV.
+static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
+ ArrayRef<int> Mask, const APInt &Zeroable,
+ const X86Subtarget &Subtarget) {
+ if (!VT.is512BitVector() && !Subtarget.hasVLX())
+ return false;
+
+ unsigned NumElts = Mask.size();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / EltSizeInBits;
+
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ unsigned SrcEltBits = EltSizeInBits * Scale;
+ if (SrcEltBits < 32 && !Subtarget.hasBWI())
+ continue;
+ unsigned NumSrcElts = NumElts / Scale;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
+ continue;
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
+ SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
+ SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
+ DstVT = MVT::getIntegerVT(EltSizeInBits);
+ if ((NumSrcElts * EltSizeInBits) >= 128) {
+ // ISD::TRUNCATE
+ DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
+ } else {
+ // X86ISD::VTRUNC
+ DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
+ }
+ return true;
+ }
+
+ return false;
+}
+
static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
int Delta) {
int Size = (int)Mask.size();
@@ -33192,11 +33245,12 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
return VTBits;
case X86ISD::VTRUNC: {
- // TODO: Add DemandedElts support.
SDValue Src = Op.getOperand(0);
- unsigned NumSrcBits = Src.getScalarValueSizeInBits();
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
assert(VTBits < NumSrcBits && "Illegal truncation input type");
- unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
if (Tmp > (NumSrcBits - VTBits))
return Tmp - (NumSrcBits - VTBits);
return 1;
@@ -34094,6 +34148,43 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ // Match shuffle against TRUNCATE patterns.
+ if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
+ // Match against a VTRUNC instruction, accounting for src/dst sizes.
+ if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
+ Subtarget)) {
+ bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
+ ShuffleSrcVT.getVectorNumElements();
+ unsigned Opc = IsTRUNCATE ? ISD::TRUNCATE : X86ISD::VTRUNC;
+ if (Depth == 0 && Root.getOpcode() == Opc)
+ return SDValue(); // Nothing to do!
+ V1 = DAG.getBitcast(ShuffleSrcVT, V1);
+ Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
+ if (ShuffleVT.getSizeInBits() < RootSizeInBits)
+ Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // Do we need a more general binary truncation pattern?
+ if (RootSizeInBits < 512 &&
+ ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
+ (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
+ (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
+ isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
+ if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
+ return SDValue(); // Nothing to do!
+ ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+ ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
+ V1 = DAG.getBitcast(ShuffleSrcVT, V1);
+ V2 = DAG.getBitcast(ShuffleSrcVT, V2);
+ ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+ ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
+ Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 1)
diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll
index bf49987aba4f..171fd45419b3 100644
--- a/llvm/test/CodeGen/X86/avx512-trunc.ll
+++ b/llvm/test/CodeGen/X86/avx512-trunc.ll
@@ -91,10 +91,15 @@ define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 {
}
define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 {
-; ALL-LABEL: trunc_qb_128:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; ALL-NEXT: retq
+; KNL-LABEL: trunc_qb_128:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: trunc_qb_128:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpmovqb %xmm0, %xmm0
+; SKX-NEXT: retq
%x = trunc <2 x i64> %i to <2 x i8>
ret <2 x i8> %x
}
@@ -344,10 +349,15 @@ define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 {
}
define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 {
-; ALL-LABEL: trunc_db_128:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; ALL-NEXT: retq
+; KNL-LABEL: trunc_db_128:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: trunc_db_128:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpmovdb %xmm0, %xmm0
+; SKX-NEXT: retq
%x = trunc <4 x i32> %i to <4 x i8>
ret <4 x i8> %x
}
@@ -537,10 +547,15 @@ define <16 x i8> @trunc_wb_256_mem_and_ret(<16 x i16> %i, <16 x i8>* %res) #0 {
}
define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 {
-; ALL-LABEL: trunc_wb_128:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; ALL-NEXT: retq
+; KNL-LABEL: trunc_wb_128:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT: retq
+;
+; SKX-LABEL: trunc_wb_128:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpmovwb %xmm0, %xmm0
+; SKX-NEXT: retq
%x = trunc <8 x i16> %i to <8 x i8>
ret <8 x i8> %x
}
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index 8b9a7a03d502..c921a90ff10e 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1554,9 +1554,7 @@ define <8 x i64> @cmp_swap_bug(<16 x i8>* %x, <8 x i64> %y, <8 x i64> %z) {
; SKX-LABEL: cmp_swap_bug:
; SKX: ## %bb.0: ## %entry
; SKX-NEXT: vmovdqa (%rdi), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x17]
-; SKX-NEXT: vpshufb {{.*}}(%rip), %xmm2, %xmm2 ## EVEX TO VEX Compression xmm2 = xmm2[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SKX-NEXT: ## encoding: [0xc4,0xe2,0x69,0x00,0x15,A,A,A,A]
-; SKX-NEXT: ## fixup A - offset: 5, value: LCPI69_0-4, kind: reloc_riprel_4byte
+; SKX-NEXT: vpmovwb %xmm2, %xmm2 ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0xd2]
; SKX-NEXT: vpmovb2m %xmm2, %k1 ## encoding: [0x62,0xf2,0x7e,0x08,0x29,0xca]
; SKX-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x64,0xc0]
; SKX-NEXT: retq ## encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
index effa39bb8f1f..11dd987729d9 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
@@ -736,7 +736,7 @@ define <4 x i64> @test_mm256_maskz_broadcastw_epi16(i16 %a0, <2 x i64> %a1) {
define <2 x i64> @test_mm_cvtepi16_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi16_epi8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovwb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <8 x i16>
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
index 3386e4043da7..0e48424a2d25 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -3236,7 +3236,7 @@ entry:
define <2 x i64> @test_mm_cvtepi32_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovdb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
@@ -3249,7 +3249,7 @@ entry:
define <2 x i64> @test_mm_cvtepi32_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi32_epi16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovdw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
@@ -3262,7 +3262,7 @@ entry:
define <2 x i64> @test_mm_cvtepi64_epi8(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovqb %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <2 x i64> %__A to <2 x i8>
@@ -3274,7 +3274,7 @@ entry:
define <2 x i64> @test_mm_cvtepi64_epi16(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi16:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpmovqw %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <2 x i64> %__A to <2 x i16>
@@ -3286,7 +3286,7 @@ entry:
define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepi64_epi32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; CHECK-NEXT: vpmovqd %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
%conv.i = trunc <2 x i64> %__A to <2 x i32>
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index 7805c46e5c34..87e22321a3fc 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -37,12 +37,33 @@ define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX-NEXT: vmovq %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovq %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v8i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovwb %xmm0, %xmm0
+; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -286,12 +307,33 @@ define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX-NEXT: vmovd %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovd %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v4i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
store <4 x i8> %strided.vec, <4 x i8>* %S
@@ -503,12 +545,33 @@ define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm0, %xmm0
+; AVX512VL-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v16i8_to_v2i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512BWVL-NEXT: retq
%vec = load <16 x i8>, <16 x i8>* %L
%strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8>
store <2 x i8> %strided.vec, <2 x i8>* %S
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 01890e2adba9..e0545c8a584d 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -59,21 +59,16 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT: vpmovwb %ymm0, (%rsi)
+; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -169,13 +164,9 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
@@ -191,18 +182,16 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
-; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT: vpmovdw %ymm0, (%rsi)
+; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%vec = load <16 x i16>, <16 x i16>* %L
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -282,12 +271,40 @@ define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
; AVX-NEXT: vmovaps %xmm0, (%rsi)
; AVX-NEXT: retq
;
-; AVX512-LABEL: shuffle_v8i32_to_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX512-NEXT: vmovaps %xmm0, (%rsi)
-; AVX512-NEXT: retq
+; AVX512F-LABEL: shuffle_v8i32_to_v4i32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps (%rdi), %xmm0
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX512F-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
+; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+;
+; AVX512VBMIVL-LABEL: shuffle_v8i32_to_v4i32:
+; AVX512VBMIVL: # %bb.0:
+; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi)
+; AVX512VBMIVL-NEXT: vzeroupper
+; AVX512VBMIVL-NEXT: retq
%vec = load <8 x i32>, <8 x i32>* %L
%strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
store <4 x i32> %strided.vec, <4 x i32>* %S
@@ -387,9 +404,8 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: retq
@@ -409,9 +425,8 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovdb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
@@ -419,9 +434,11 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2024390091656922112,2024390091656922112]
-; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VBMIVL-NEXT: vpmovdb %xmm1, %xmm1
+; AVX512VBMIVL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VBMIVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -1278,9 +1295,8 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovqb %xmm1, %xmm1
+; AVX512VL-NEXT: vpmovqb %xmm0, %xmm0
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VL-NEXT: retq
@@ -1300,9 +1316,8 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovqb %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovqb %xmm0, %xmm0
; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT: retq
@@ -1310,9 +1325,11 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [403703808,403703808,403703808,403703808]
-; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovd %xmm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VBMIVL-NEXT: vpmovqb %xmm1, %xmm1
+; AVX512VBMIVL-NEXT: vpmovqb %xmm0, %xmm0
+; AVX512VBMIVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VBMIVL-NEXT: vmovd %xmm0, (%rsi)
; AVX512VBMIVL-NEXT: retq
%vec = load <32 x i8>, <32 x i8>* %L
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 7f00b49b81f8..d0e8dc210051 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -37,46 +37,29 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
-; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VBMI-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
-; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
+; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
@@ -139,74 +122,12 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
}
define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
-; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
-; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
-; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
-; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i16_to_v16i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
store <16 x i16> %strided.vec, <16 x i16>* %S
@@ -228,59 +149,12 @@ define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
}
define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps (%rdi), %ymm0
-; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
-; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
-; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
-; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0
-; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
-; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
-; AVX512VBMIVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
+; AVX512-LABEL: shuffle_v16i32_to_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%vec = load <16 x i32>, <16 x i32>* %L
%strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
store <8 x i32> %strided.vec, <8 x i32>* %S
@@ -322,20 +196,17 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512VL-NEXT: vpmovdb %ymm1, %xmm1
+; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
@@ -358,20 +229,17 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpmovdb %ymm1, %xmm1
+; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
index 0d5566028e8e..ed2db51a8ad5 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
@@ -1587,7 +1587,7 @@ define <2 x i8> @strict_vector_fptosi_v2f64_to_v2i8(<2 x double> %a) #0 {
; AVX512VL-LABEL: strict_vector_fptosi_v2f64_to_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VL-NEXT: ret{{[l|q]}}
;
; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i8:
@@ -1599,7 +1599,7 @@ define <2 x i8> @strict_vector_fptosi_v2f64_to_v2i8(<2 x double> %a) #0 {
; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f64_to_v2i8:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VLDQ-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VLDQ-NEXT: ret{{[l|q]}}
%ret = call <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f64(<2 x double> %a,
metadata !"fpexcept.strict")
@@ -1638,7 +1638,7 @@ define <2 x i8> @strict_vector_fptoui_v2f64_to_v2i8(<2 x double> %a) #0 {
; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VL-NEXT: ret{{[l|q]}}
;
; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i8:
@@ -1650,7 +1650,7 @@ define <2 x i8> @strict_vector_fptoui_v2f64_to_v2i8(<2 x double> %a) #0 {
; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f64_to_v2i8:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VLDQ-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VLDQ-NEXT: ret{{[l|q]}}
%ret = call <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f64(<2 x double> %a,
metadata !"fpexcept.strict")
@@ -1694,7 +1694,7 @@ define <2 x i8> @strict_vector_fptosi_v2f32_to_v2i8(<2 x float> %a) #0 {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VL-NEXT: ret{{[l|q]}}
;
; AVX512DQ-LABEL: strict_vector_fptosi_v2f32_to_v2i8:
@@ -1708,7 +1708,7 @@ define <2 x i8> @strict_vector_fptosi_v2f32_to_v2i8(<2 x float> %a) #0 {
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VLDQ-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VLDQ-NEXT: ret{{[l|q]}}
%ret = call <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f32(<2 x float> %a,
metadata !"fpexcept.strict")
@@ -1752,7 +1752,7 @@ define <2 x i8> @strict_vector_fptoui_v2f32_to_v2i8(<2 x float> %a) #0 {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VL-NEXT: ret{{[l|q]}}
;
; AVX512DQ-LABEL: strict_vector_fptoui_v2f32_to_v2i8:
@@ -1766,7 +1766,7 @@ define <2 x i8> @strict_vector_fptoui_v2f32_to_v2i8(<2 x float> %a) #0 {
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VLDQ-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VLDQ-NEXT: ret{{[l|q]}}
%ret = call <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f32(<2 x float> %a,
metadata !"fpexcept.strict")
@@ -2666,7 +2666,7 @@ define <4 x i8> @strict_vector_fptosi_v4f32_to_v4i8(<4 x float> %a) #0 {
; AVX512VL-LABEL: strict_vector_fptosi_v4f32_to_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VL-NEXT: ret{{[l|q]}}
;
; AVX512DQ-LABEL: strict_vector_fptosi_v4f32_to_v4i8:
@@ -2678,7 +2678,7 @@ define <4 x i8> @strict_vector_fptosi_v4f32_to_v4i8(<4 x float> %a) #0 {
; AVX512VLDQ-LABEL: strict_vector_fptosi_v4f32_to_v4i8:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VLDQ-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VLDQ-NEXT: ret{{[l|q]}}
%ret = call <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f32(<4 x float> %a,
metadata !"fpexcept.strict")
@@ -2717,7 +2717,7 @@ define <4 x i8> @strict_vector_fptoui_v4f32_to_v4i8(<4 x float> %a) #0 {
; AVX512VL-LABEL: strict_vector_fptoui_v4f32_to_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VL-NEXT: ret{{[l|q]}}
;
; AVX512DQ-LABEL: strict_vector_fptoui_v4f32_to_v4i8:
@@ -2729,7 +2729,7 @@ define <4 x i8> @strict_vector_fptoui_v4f32_to_v4i8(<4 x float> %a) #0 {
; AVX512VLDQ-LABEL: strict_vector_fptoui_v4f32_to_v4i8:
; AVX512VLDQ: # %bb.0:
; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VLDQ-NEXT: vpmovdb %xmm0, %xmm0
; AVX512VLDQ-NEXT: ret{{[l|q]}}
%ret = call <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f32(<4 x float> %a,
metadata !"fpexcept.strict")
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
index 9c704b1d526f..0f9fc8dcb6af 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
@@ -1247,24 +1247,80 @@ define <4 x i16> @strict_vector_fptoui_v4f64_to_v4i16(<4 x double> %a) #0 {
}
define <4 x i8> @strict_vector_fptosi_v4f64_to_v4i8(<4 x double> %a) #0 {
-; CHECK-LABEL: strict_vector_fptosi_v4f64_to_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: ret{{[l|q]}}
+; AVX-LABEL: strict_vector_fptosi_v4f64_to_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512F-LABEL: strict_vector_fptosi_v4f64_to_v4i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: ret{{[l|q]}}
+;
+; AVX512VL-LABEL: strict_vector_fptosi_v4f64_to_v4i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: ret{{[l|q]}}
+;
+; AVX512DQ-LABEL: strict_vector_fptosi_v4f64_to_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: ret{{[l|q]}}
+;
+; AVX512DQVL-LABEL: strict_vector_fptosi_v4f64_to_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: ret{{[l|q]}}
%ret = call <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f64(<4 x double> %a,
metadata !"fpexcept.strict")
ret <4 x i8> %ret
}
define <4 x i8> @strict_vector_fptoui_v4f64_to_v4i8(<4 x double> %a) #0 {
-; CHECK-LABEL: strict_vector_fptoui_v4f64_to_v4i8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: ret{{[l|q]}}
+; AVX-LABEL: strict_vector_fptoui_v4f64_to_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512F-LABEL: strict_vector_fptoui_v4f64_to_v4i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: ret{{[l|q]}}
+;
+; AVX512VL-LABEL: strict_vector_fptoui_v4f64_to_v4i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: ret{{[l|q]}}
+;
+; AVX512DQ-LABEL: strict_vector_fptoui_v4f64_to_v4i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vzeroupper
+; AVX512DQ-NEXT: ret{{[l|q]}}
+;
+; AVX512DQVL-LABEL: strict_vector_fptoui_v4f64_to_v4i8:
+; AVX512DQVL: # %bb.0:
+; AVX512DQVL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; AVX512DQVL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512DQVL-NEXT: vzeroupper
+; AVX512DQVL-NEXT: ret{{[l|q]}}
%ret = call <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f64(<4 x double> %a,
metadata !"fpexcept.strict")
ret <4 x i8> %ret
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index d412a48c03b5..e42742fd008d 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -2302,11 +2302,35 @@ define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) {
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: fptosi_2f32_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; VEX-LABEL: fptosi_2f32_to_2i8:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptosi_2f32_to_2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptosi_2f32_to_2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptosi_2f32_to_2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f32_to_2i8:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptosi <2 x float> %a to <2 x i8>
ret <2 x i8> %cvt
}
@@ -2336,11 +2360,35 @@ define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) {
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: fptoui_2f32_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; VEX-LABEL: fptoui_2f32_to_2i8:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttps2dq %xmm0, %xmm0
+; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptoui_2f32_to_2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptoui_2f32_to_2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_2f32_to_2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f32_to_2i8:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptoui <2 x float> %a to <2 x i8>
ret <2 x i8> %cvt
}
@@ -2370,11 +2418,35 @@ define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) {
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: fptosi_2f64_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; VEX-LABEL: fptosi_2f64_to_2i8:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptosi_2f64_to_2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptosi_2f64_to_2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptosi_2f64_to_2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptosi_2f64_to_2i8:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptosi <2 x double> %a to <2 x i8>
ret <2 x i8> %cvt
}
@@ -2404,11 +2476,35 @@ define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) {
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: fptoui_2f64_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; VEX-LABEL: fptoui_2f64_to_2i8:
+; VEX: # %bb.0:
+; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; VEX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; VEX-NEXT: retq
+;
+; AVX512F-LABEL: fptoui_2f64_to_2i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: fptoui_2f64_to_2i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512DQ-LABEL: fptoui_2f64_to_2i8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: retq
+;
+; AVX512VLDQ-LABEL: fptoui_2f64_to_2i8:
+; AVX512VLDQ: # %bb.0:
+; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX512VLDQ-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VLDQ-NEXT: retq
%cvt = fptoui <2 x double> %a to <2 x i8>
ret <2 x i8> %cvt
}
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index 9bdd722f6402..ca78dad09768 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -2376,24 +2376,21 @@ define i8 @test_v64i8(<64 x i8> %a0) {
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX512BWVL-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmovwb %ymm2, %xmm2
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1
; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovd %xmm0, %eax
@@ -2794,24 +2791,21 @@ define i8 @test_v128i8(<128 x i8> %a0) {
; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX512BWVL-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BWVL-NEXT: vpmovwb %ymm2, %xmm2
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1
; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovd %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index c9329251de31..f3b35423bd7a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -973,10 +973,15 @@ define <16 x i8> @trunc_v4i32_shuffle(<16 x i8> %a) {
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: retq
;
-; AVX-LABEL: trunc_v4i32_shuffle:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: trunc_v4i32_shuffle:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: trunc_v4i32_shuffle:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i8> %shuffle
}
@@ -2004,19 +2009,13 @@ define <16 x i8> @PR12412(<16 x i8> %inval1, <16 x i8> %inval2) {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: retq
;
-; AVX512VLBW-LABEL: PR12412:
-; AVX512VLBW: # %bb.0: # %entry
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VLBW-NEXT: retq
-;
-; AVX512VLVBMI-LABEL: PR12412:
-; AVX512VLVBMI: # %bb.0: # %entry
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0
-; AVX512VLVBMI-NEXT: retq
+; AVX512VL-LABEL: PR12412:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
;
; XOP-LABEL: PR12412:
; XOP: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 343fd75044e7..3ad9ff10f2ba 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4827,27 +4827,12 @@ define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
-; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
-; AVX512VLBW-SLOW: # %bb.0:
-; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VLBW-SLOW-NEXT: retq
-;
-; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
-; AVX512VLBW-FAST: # %bb.0:
-; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,5,7]
-; AVX512VLBW-FAST-NEXT: vpermi2q %ymm1, %ymm2, %ymm0
-; AVX512VLBW-FAST-NEXT: retq
-;
-; AVX512VLVBMI-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
-; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
-; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
-; AVX512VLVBMI-NEXT: retq
+; AVX512VL-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
; XOPAVX1: # %bb.0:
@@ -4894,10 +4879,10 @@ define <32 x i8> @shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_
;
; AVX512VLBW-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VLBW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VLBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 99a5c5f63d7d..f3c3e140c2f9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -3133,8 +3133,9 @@ define <8 x i32> @add_v8i32_02468ACE_13579BDF(<8 x i32> %a, <8 x i32> %b) {
;
; AVX512VL-SLOW-LABEL: add_v8i32_02468ACE_13579BDF:
; AVX512VL-SLOW: # %bb.0: # %entry
-; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512VL-SLOW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VL-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2
+; AVX512VL-SLOW-NEXT: vpmovqd %zmm2, %ymm2
; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0
@@ -3142,8 +3143,9 @@ define <8 x i32> @add_v8i32_02468ACE_13579BDF(<8 x i32> %a, <8 x i32> %b) {
;
; AVX512VL-FAST-LABEL: add_v8i32_02468ACE_13579BDF:
; AVX512VL-FAST: # %bb.0: # %entry
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
-; AVX512VL-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512VL-FAST-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VL-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2
+; AVX512VL-FAST-NEXT: vpmovqd %zmm2, %ymm2
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
; AVX512VL-FAST-NEXT: vpermi2d %ymm1, %ymm0, %ymm3
; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0
@@ -3180,8 +3182,9 @@ define <8 x i32> @add_v8i32_8ACE0246_9BDF1357(<8 x i32> %a, <8 x i32> %b) {
;
; AVX512VL-SLOW-LABEL: add_v8i32_8ACE0246_9BDF1357:
; AVX512VL-SLOW: # %bb.0: # %entry
-; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
+; AVX512VL-SLOW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VL-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2
+; AVX512VL-SLOW-NEXT: vpmovqd %zmm2, %ymm2
; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-SLOW-NEXT: vpaddd %ymm0, %ymm2, %ymm0
@@ -3189,8 +3192,9 @@ define <8 x i32> @add_v8i32_8ACE0246_9BDF1357(<8 x i32> %a, <8 x i32> %b) {
;
; AVX512VL-FAST-LABEL: add_v8i32_8ACE0246_9BDF1357:
; AVX512VL-FAST: # %bb.0: # %entry
-; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
-; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512VL-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VL-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm2
+; AVX512VL-FAST-NEXT: vpmovqd %zmm2, %ymm2
; AVX512VL-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,9,11,13,15]
; AVX512VL-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm3
; AVX512VL-FAST-NEXT: vpaddd %ymm3, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 94b08c9abb82..87235ed9c69d 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1556,10 +1556,31 @@ define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: retq
;
-; AVX512-LABEL: trunc2x2i64_4i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc2x2i64_4i32:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x2i64_4i32:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x2i64_4i32:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x2i64_4i32:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <2 x i64> %a to <2 x i32>
%1 = trunc <2 x i64> %b to <2 x i32>
@@ -1637,10 +1658,10 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
;
; AVX512VL-LABEL: trunc2x4i32_8i16:
; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i32_8i16:
@@ -1653,8 +1674,10 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
;
; AVX512BWVL-LABEL: trunc2x4i32_8i16:
; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <4 x i32> %a to <4 x i16>
@@ -1735,13 +1758,37 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
;
-; AVX512-LABEL: trunc2x8i16_16i8:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc2x8i16_16i8:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc2x8i16_16i8:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc2x8i16_16i8:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc2x8i16_16i8:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <8 x i16> %a to <8 x i8>
%1 = trunc <8 x i16> %b to <8 x i8>
@@ -1776,11 +1823,29 @@ define i64 @trunc8i16_i64(<8 x i16> %inval) {
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
-; AVX512-LABEL: trunc8i16_i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: retq
+; AVX512F-LABEL: trunc8i16_i64:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: trunc8i16_i64:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: trunc8i16_i64:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, %rax
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: trunc8i16_i64:
+; AVX512BWVL: # %bb.0: # %entry
+; AVX512BWVL-NEXT: vpmovwb %xmm0, %xmm0
+; AVX512BWVL-NEXT: vmovq %xmm0, %rax
+; AVX512BWVL-NEXT: retq
entry:
%0 = trunc <8 x i16> %inval to <8 x i8>
%1 = bitcast <8 x i8> %0 to i64
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index e557a9d9e162..5209723ceadb 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -774,94 +774,89 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
;
; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vmovdqa (%rdi), %xmm10
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm11
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm13
-; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm4
-; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm5
-; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm7
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14
-; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm7
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = mem[2,3,0,1]
-; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm6
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm9
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm6
-; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = mem[2,3,0,1]
-; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm10
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm3
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = mem[2,3,0,1]
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm11
+; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm12
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm6
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = mem[2,3,0,1]
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm13
+; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm5
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512-NEXT: vpmovdb %zmm5, %xmm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm1
-; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm2
+; AVX512-NEXT: vmovdqa (%rdi), %xmm14
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm6
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm4
+; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm1
+; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm2
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm5
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm3
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm0
+; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm3
+; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm1
-; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm2
+; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm1
+; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm2
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm5
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm3
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm0
+; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm3
+; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
+; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm2
-; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm2
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm3
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm4
-; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm2
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm5
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
+; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm4
+; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm2
-; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vpcmpeqb %zmm9, %zmm8, %k0
-; AVX512-NEXT: vpcmpeqb %zmm0, %zmm15, %k1
+; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vpcmpeqb %zmm8, %zmm9, %k0
+; AVX512-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
; AVX512-NEXT: kxnord %k1, %k0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
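
For context when reading the test deltas above: the pattern the new combine recognizes is a shuffle that concatenates two truncated vectors. A minimal sketch of such an input, assuming an AVX512VL-capable target (the function name and the llc invocation below are illustrative only and are not taken from the patch; the committed tests carry their own RUN lines):

; Illustrative reproducer, not part of the patch.
; llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl < %s
define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a, <4 x i32> %b) {
entry:
  ; Truncate each source, then interleave the halves with an identity mask.
  %ta = trunc <4 x i32> %a to <4 x i16>
  %tb = trunc <4 x i32> %b to <4 x i16>
  %r = shufflevector <4 x i16> %ta, <4 x i16> %tb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %r
}

With this change, inputs of this shape follow the AVX512VL lowering visible in the vector-trunc.ll hunk above: the second operand is inserted into the widened first operand and a single vpmovdw performs the truncating shuffle, replacing the earlier two-vpshufb-plus-vpunpcklqdq sequence.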