[llvm] [X86] SimplifyDemandedVectorEltsForTargetNode - reduce the size of VPERMV/VPERMV3 nodes if the upper elements are not demanded (PR #133923)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 2 04:05:05 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/133923
>From 1070f6baea62960df10197d6327c23cecc2a27e8 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 1 Apr 2025 16:18:22 +0100
Subject: [PATCH] [X86] SimplifyDemandedVectorEltsForTargetNode - reduce the
size of VPERMV/VPERMV3 nodes if the upper elements are not demanded
With AVX512VL targets, use 128/256-bit VPERMV/VPERMV3 nodes when we only need the lower elements.
This exposed an issue with VPERMV3(X,M,Y) -> VPERMV(M,CONCAT(X,Y)) folds when X==Y, so I had to move that fold after the other VPERMV3 folds/canonicalizations.
I also took the opportunity to try to support the VPERMV(M,CONCAT(Y,X)) case as well, but we can revert this if we'd prefer to avoid the extra VSHUFF64X2 node for non-constant shuffle masks (but separate loads) instead.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 124 +++++++++++++-----
.../any_extend_vector_inreg_of_broadcast.ll | 46 +++----
...d_vector_inreg_of_broadcast_from_memory.ll | 8 +-
llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll | 12 +-
.../vector-interleaved-load-i16-stride-5.ll | 76 ++++++-----
.../vector-interleaved-store-i64-stride-5.ll | 32 ++---
.../vector-interleaved-store-i64-stride-6.ll | 96 +++++++-------
.../X86/vector-shuffle-combining-avx512vl.ll | 14 +-
.../zero_extend_vector_inreg_of_broadcast.ll | 22 ++--
...d_vector_inreg_of_broadcast_from_memory.ll | 8 +-
10 files changed, 256 insertions(+), 182 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8e6a891444bf1..7bf197fa5f261 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42675,40 +42675,10 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
return SDValue();
}
case X86ISD::VPERMV3: {
- // Combine VPERMV3 to widened VPERMV if the two source operands can be
- // freely concatenated.
- MVT WideVT = VT.getDoubleNumVectorElementsVT();
- bool CanConcat = VT.is128BitVector() ||
- (VT.is256BitVector() && Subtarget.useAVX512Regs());
- if (CanConcat) {
- SDValue Ops[] = {N.getOperand(0), N.getOperand(2)};
- if (SDValue ConcatSrc =
- combineConcatVectorOps(DL, WideVT, Ops, DAG, Subtarget)) {
- SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
- DL, WideVT.getSizeInBits());
- SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
- DAG.getVectorIdxConstant(0, DL));
- }
- }
SmallVector<SDValue, 2> SrcOps;
SmallVector<int, 32> Mask;
if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
- // See if we can concatenate the commuted operands.
- if (CanConcat) {
- if (SDValue ConcatSrc = combineConcatVectorOps(
- DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
- Subtarget)) {
- ShuffleVectorSDNode::commuteMask(Mask);
- Mask.append(NumElts, SM_SentinelUndef);
- SDValue Perm =
- lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
- DAG.getUNDEF(WideVT), Subtarget, DAG);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
- DAG.getVectorIdxConstant(0, DL));
- }
- }
SDValue V1 = peekThroughBitcasts(N.getOperand(0));
SDValue V2 = peekThroughBitcasts(N.getOperand(2));
// Canonicalize to VPERMV if both sources are the same.
@@ -42743,6 +42713,37 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
N.getOperand(0), Subtarget, DAG);
}
}
+ // Combine VPERMV3 to widened VPERMV if the two source operands can be
+ // freely concatenated.
+ MVT WideVT = VT.getDoubleNumVectorElementsVT();
+ if (VT.is128BitVector() ||
+ (VT.is256BitVector() && Subtarget.useAVX512Regs())) {
+ SDValue Ops[] = {N.getOperand(0), N.getOperand(2)};
+ if (SDValue ConcatSrc =
+ combineConcatVectorOps(DL, WideVT, Ops, DAG, Subtarget)) {
+ SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
+ DL, WideVT.getSizeInBits());
+ SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+      // See if we can concatenate the commuted operands (and then cheaply
+      // shuffle them back - for constant shuffle masks this should fold away).
+ SDValue SwapOps[] = {N.getOperand(2), N.getOperand(0)};
+ if (SDValue ConcatSrc =
+ combineConcatVectorOps(DL, WideVT, SwapOps, DAG, Subtarget)) {
+ SmallVector<int, 16> SwapMask(WideVT.getVectorNumElements());
+ std::iota(SwapMask.begin(), SwapMask.begin() + NumElts, NumElts);
+ std::iota(SwapMask.begin() + NumElts, SwapMask.end(), 0);
+ SDValue Swap = DAG.getVectorShuffle(WideVT, DL, ConcatSrc,
+ DAG.getUNDEF(WideVT), SwapMask);
+ SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
+ DL, WideVT.getSizeInBits());
+ SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, Swap);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+ }
return SDValue();
}
default:
@@ -43814,6 +43815,69 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
break;
}
+ case X86ISD::VPERMV: {
+ SmallVector<int, 16> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if ((VT.is256BitVector() || Subtarget.hasVLX()) &&
+ getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
+ // For lane-crossing shuffles, only split in half in case we're still
+ // referencing higher elements.
+ unsigned HalfElts = NumElts / 2;
+ unsigned HalfSize = SizeInBits / 2;
+ Mask.resize(HalfElts);
+ if (all_of(Mask,
+ [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
+ MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
+ SDLoc DL(Op);
+ SDValue Ext;
+ SDValue M =
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
+ SDValue V =
+ extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
+        // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
+        if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
+          Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
+        else {
+          // VPERMILPD selects each 64-bit element using bit#1 of the mask
+          // element (not the plain index), so scale the index mask.
+          if (VT.getScalarSizeInBits() == 64)
+            M = TLO.DAG.getNode(ISD::ADD, DL, M.getValueType(), M, M);
+          Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M);
+        }
+ SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
+ Subtarget, TLO.DAG, DL, SizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
+ break;
+ }
+ case X86ISD::VPERMV3: {
+ SmallVector<int, 16> Mask;
+ SmallVector<SDValue, 2> Ops;
+ if (Subtarget.hasVLX() &&
+ getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
+ // For lane-crossing shuffles, only split in half in case we're still
+ // referencing higher elements.
+ unsigned HalfElts = NumElts / 2;
+ unsigned HalfSize = SizeInBits / 2;
+ Mask.resize(HalfElts);
+ if (all_of(Mask, [&](int M) {
+ return isUndefOrInRange(M, 0, HalfElts) ||
+ isUndefOrInRange(M, NumElts, NumElts + HalfElts);
+ })) {
+ // Adjust mask elements for 2nd operand to point to half width.
+      for (int &M : Mask)
+        M = M < (int)NumElts ? M : (M - HalfElts);
+ MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
+ MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
+ SDLoc DL(Op);
+ SDValue Ext = TLO.DAG.getNode(
+ Opc, DL, HalfVT,
+ extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
+ getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
+ extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
+ SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
+ Subtarget, TLO.DAG, DL, SizeInBits);
+ return TLO.CombineTo(Op, Insert);
+ }
+ }
+ break;
+ }
case X86ISD::VPERM2X128: {
// Simplify VPERM2F128/VPERM2I128 to extract_subvector.
SDLoc DL(Op);
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 6f4e7abda8b00..b075d48627b18 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
;
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
+; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax
; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1
; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
;
; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -4610,10 +4610,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
-; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
+; AVX512F-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
@@ -4623,10 +4623,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
-; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
+; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
@@ -4868,10 +4868,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
-; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512F-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-NEXT: vzeroupper
@@ -4881,10 +4881,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,11]
-; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512DQ-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpaddb (%rdx), %ymm2, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 52f856befa130..61e122b1aba36 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
@@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index 26af46263c0e2..a84466bc1ca1a 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -1113,8 +1113,8 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
;
; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
; AVX512VBMI-FAST: # %bb.0:
-; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79]
-; AVX512VBMI-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79]
+; AVX512VBMI-FAST-NEXT: vpmovdb %ymm0, %xmm2
; AVX512VBMI-FAST-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm0, %eax
@@ -1124,14 +1124,14 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
;
; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
; AVX512VBMI-SLOW: # %bb.0:
-; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79]
-; AVX512VBMI-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-SLOW-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
+; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,92,96,100,104,108,112,13,14,15]
+; AVX512VBMI-SLOW-NEXT: vpmovdb %ymm0, %xmm2
+; AVX512VBMI-SLOW-NEXT: vpermt2b %zmm0, %zmm1, %zmm2
; AVX512VBMI-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
; AVX512VBMI-SLOW-NEXT: vpextrw $6, %xmm0, %eax
; AVX512VBMI-SLOW-NEXT: vpextrw $4, %xmm0, %ecx
; AVX512VBMI-SLOW-NEXT: vpextrw $2, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm1, %xmm0
+; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm2, %xmm0
; AVX512VBMI-SLOW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; AVX512VBMI-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512VBMI-SLOW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 739e6e2369e36..9b19ec15c6f55 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -593,100 +593,104 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-LABEL: load_i16_stride5_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm1, (%r9)
+; AVX512BW-NEXT: vmovq %xmm2, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi)
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <20 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index 05c111ae5049f..f41123c5c3cfd 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -123,8 +123,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5]
+; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; AVX512-NEXT: vmovdqa %xmm3, 64(%r9)
@@ -140,8 +140,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5]
+; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; AVX512-FCP-NEXT: vmovdqa %xmm3, 64(%r9)
@@ -157,8 +157,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
-; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5]
+; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; AVX512DQ-NEXT: vmovdqa %xmm3, 64(%r9)
@@ -174,8 +174,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5]
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 64(%r9)
@@ -191,8 +191,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
-; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5]
+; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa %xmm3, 64(%r9)
@@ -208,8 +208,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5]
+; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; AVX512BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9)
@@ -225,8 +225,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
-; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5]
+; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; AVX512DQ-BW-NEXT: vmovdqa %xmm3, 64(%r9)
@@ -242,8 +242,8 @@ define void @store_i64_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,9]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,5]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, 64(%r9)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
index c2f1723d8031e..aac6a1bddd08a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll
@@ -139,12 +139,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512-NEXT: vmovdqa %ymm3, 64(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
+; AVX512-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
+; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
+; AVX512-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
+; AVX512-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -158,12 +158,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
+; AVX512-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -177,12 +177,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
-; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-NEXT: vmovdqa %ymm3, 64(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
+; AVX512DQ-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -196,12 +196,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -215,12 +215,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
-; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
-; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
+; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
+; AVX512BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
+; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -234,12 +234,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
+; AVX512BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
+; AVX512BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -253,12 +253,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
-; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
+; AVX512DQ-BW-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -272,12 +272,12 @@ define void @store_i64_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, (%r9), %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,3,9,11]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,4,6,8,10,1,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <2 x i64>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
index 1f4228b1fdec9..539b1022f166b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
@@ -29,15 +29,21 @@ define <4 x double> @concat_vpermv3_ops_vpermv_v4f64(ptr %p0, <4 x i64> %m) {
define <4 x double> @concat_vpermv3_ops_vpermv_swap_v4f64(ptr %p0, <4 x i64> %m) {
; X86-LABEL: concat_vpermv3_ops_vpermv_swap_v4f64:
; X86: # %bb.0:
+; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: vmovapd 32(%eax), %ymm1
-; X86-NEXT: vpermi2pd (%eax), %ymm1, %ymm0
+; X86-NEXT: vmovupd (%eax), %zmm1
+; X86-NEXT: vshuff64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,0,1,2,3]
+; X86-NEXT: vpermpd %zmm1, %zmm0, %zmm0
+; X86-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; X86-NEXT: retl
;
; X64-LABEL: concat_vpermv3_ops_vpermv_swap_v4f64:
; X64: # %bb.0:
-; X64-NEXT: vmovapd 32(%rdi), %ymm1
-; X64-NEXT: vpermi2pd (%rdi), %ymm1, %ymm0
+; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NEXT: vmovupd (%rdi), %zmm1
+; X64-NEXT: vshuff64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,0,1,2,3]
+; X64-NEXT: vpermpd %zmm1, %zmm0, %zmm0
+; X64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; X64-NEXT: retq
%p1 = getelementptr inbounds nuw i8, ptr %p0, i64 32
%lo = load <4 x double>, ptr %p1, align 32
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 35f25d36cb2e9..ea0e3b3a2b9aa 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
;
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
+; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-SLOW-NEXT: vpermw %ymm1, %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax
; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
@@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vpermw %ymm0, %ymm1, %ymm1
; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT: vpermw %ymm0, %ymm1, %ymm1
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
;
; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-NEXT: vpermd %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index a598e30845579..a3e2fb5321f32 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
@@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpermw (%rdi), %ymm0, %ymm0
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
More information about the llvm-commits
mailing list