[llvm-branch-commits] [llvm] 15a3138 - [X86][AVX] LowerBUILD_VECTOR - reduce 256/512-bit build vectors with zero/undef upper elements + pad.
Simon Pilgrim via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Dec 15 02:16:32 PST 2020
Author: Simon Pilgrim
Date: 2020-12-15T10:11:38Z
New Revision: 15a31389b2ead8fa7052a4378b76b5d686d29ad7
URL: https://github.com/llvm/llvm-project/commit/15a31389b2ead8fa7052a4378b76b5d686d29ad7
DIFF: https://github.com/llvm/llvm-project/commit/15a31389b2ead8fa7052a4378b76b5d686d29ad7.diff
LOG: [X86][AVX] LowerBUILD_VECTOR - reduce 256/512-bit build vectors with zero/undef upper elements + pad.
As discussed on D92645, we don't do a good job of recognising when we don't require the full width of a ymm/zmm build vector because the upper elements are undef/zero.
This commit allows us to make use of the implicit zeroing of upper elements with AVX instructions, which we emulate in the DAG with an INSERT_SUBVECTOR into the bottom of an undef/zero vector of the original type.
This exposed a limitation in getTargetConstantBitsFromNode, which didn't extract bits from INSERT_SUBVECTORs of different element widths; I've included a fix for that as well to prevent a couple of regressions.
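As a hypothetical illustration (this IR is not part of the commit or its test changes), a 256-bit build vector whose upper half is all zeroes can now be lowered as a 128-bit build vector that is widened with zeroes, rather than being built at full ymm width:

; Sketch only: lanes 4-7 are zero, so the lower four lanes can be built as a
; <4 x i32> BUILD_VECTOR and padded back to <8 x i32> with zero upper elements
; (an INSERT_SUBVECTOR into a zero vector, which AVX gets for free from the
; implicit zeroing of the upper bits on xmm writes).
define <8 x i32> @narrow_build_vector(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
  %v0 = insertelement <8 x i32> undef, i32 %a0, i32 0
  %v1 = insertelement <8 x i32> %v0, i32 %a1, i32 1
  %v2 = insertelement <8 x i32> %v1, i32 %a2, i32 2
  %v3 = insertelement <8 x i32> %v2, i32 %a3, i32 3
  %v4 = insertelement <8 x i32> %v3, i32 0, i32 4
  %v5 = insertelement <8 x i32> %v4, i32 0, i32 5
  %v6 = insertelement <8 x i32> %v5, i32 0, i32 6
  %v7 = insertelement <8 x i32> %v6, i32 0, i32 7
  ret <8 x i32> %v7
}

The constant shuffle-index vectors in the test diffs below (e.g. a ymm index of [0,0,0,4,0,0,0,0] becoming an xmm load of [0,0,0,4]) are the same idea applied to all-constant build vectors.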
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/test/CodeGen/X86/pr29112.ll
llvm/test/CodeGen/X86/pr46532.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
llvm/test/CodeGen/X86/trunc-subvector.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 21af1a5aad00..78d08b36f8bf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6673,23 +6673,26 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
// Insert constant bits from a base and sub vector sources.
if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
- // TODO - support insert_subvector through bitcasts.
- if (EltSizeInBits != VT.getScalarSizeInBits())
- return false;
+ // If bitcasts to larger elements we might lose track of undefs - don't
+ // allow any to be safe.
+ unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+ bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
- APInt UndefSubElts;
- SmallVector<APInt, 32> EltSubBits;
- if (getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
+ APInt UndefSrcElts, UndefSubElts;
+ SmallVector<APInt, 32> EltSrcBits, EltSubBits;
+ if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
UndefSubElts, EltSubBits,
- AllowWholeUndefs, AllowPartialUndefs) &&
- getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
- UndefElts, EltBits, AllowWholeUndefs,
- AllowPartialUndefs)) {
+ AllowWholeUndefs && AllowUndefs,
+ AllowPartialUndefs && AllowUndefs) &&
+ getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
+ UndefSrcElts, EltSrcBits,
+ AllowWholeUndefs && AllowUndefs,
+ AllowPartialUndefs && AllowUndefs)) {
unsigned BaseIdx = Op.getConstantOperandVal(2);
- UndefElts.insertBits(UndefSubElts, BaseIdx);
+ UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
- EltBits[BaseIdx + i] = EltSubBits[i];
- return true;
+ EltSrcBits[BaseIdx + i] = EltSubBits[i];
+ return CastBitData(UndefSrcElts, EltSrcBits);
}
}
@@ -10165,17 +10168,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
return VectorConstant;
- BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
- if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
- return AddSub;
- if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
- return HorizontalOp;
- if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
- return Broadcast;
- if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
- return BitOp;
-
unsigned EVTBits = EltVT.getSizeInBits();
+ APInt UndefMask = APInt::getNullValue(NumElems);
APInt ZeroMask = APInt::getNullValue(NumElems);
APInt NonZeroMask = APInt::getNullValue(NumElems);
bool IsAllConstants = true;
@@ -10183,8 +10177,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
unsigned NumConstants = NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
- if (Elt.isUndef())
+ if (Elt.isUndef()) {
+ UndefMask.setBit(i);
continue;
+ }
Values.insert(Elt);
if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
@@ -10197,13 +10193,45 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // All undef vector. Return an UNDEF. All zero vectors were handled above.
+ if (NonZeroMask == 0) {
+ assert(UndefMask.isAllOnesValue() && "Fully undef mask expected");
+ return DAG.getUNDEF(VT);
+ }
+
+ BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
+
+ // If the upper elts of a ymm/zmm are undef/zero then we might be better off
+ // lowering to a smaller build vector and padding with undef/zero.
+ if ((VT.is256BitVector() || VT.is512BitVector()) &&
+ !isFoldableUseOfShuffle(BV)) {
+ unsigned UpperElems = NumElems / 2;
+ APInt UndefOrZeroMask = UndefMask | ZeroMask;
+ unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
+ if (NumUpperUndefsOrZeros >= UpperElems) {
+ if (VT.is512BitVector() &&
+ NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
+ UpperElems = NumElems - (NumElems / 4);
+ bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
+ MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
+ SDValue NewBV =
+ DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
+ return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
+ }
+ }
+
+ if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
+ return AddSub;
+ if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
+ return HorizontalOp;
+ if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
+ return Broadcast;
+ if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
+ return BitOp;
+
unsigned NumZero = ZeroMask.countPopulation();
unsigned NumNonZero = NonZeroMask.countPopulation();
- // All undef vector. Return an UNDEF. All zero vectors were handled above.
- if (NumNonZero == 0)
- return DAG.getUNDEF(VT);
-
// If we are inserting one variable into a vector of non-zero constants, try
// to avoid loading each constant element as a scalar. Load the constants as a
// vector and then insert the variable scalar element. If insertion is not
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index c445a522bf2f..58fd4c9c586c 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -6,8 +6,7 @@
define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
-; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [8,6,12,4,7,9,14,8]
; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
@@ -18,8 +17,7 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,8,6,12,4,7,9,14,8]
-; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8]
; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
@@ -48,8 +46,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,12,9,4,14,15,12,14,4,12,9,4,14,15,12,14]
-; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14]
; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
@@ -78,8 +75,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,4,11,14,10,7,1,6,9]
-; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9]
; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
@@ -108,8 +104,7 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x
define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
-; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,15,7,13,4,12,8,0]
; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
@@ -120,8 +115,7 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [14,15,7,13,4,12,8,0,14,15,7,13,4,12,8,0]
-; CHECK-NEXT: # ymm3 = mem[0,1,0,1]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0]
; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm0
; CHECK-NEXT: vptestnmw %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
@@ -898,8 +892,7 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mem_mask3(<32 x i16>* %vp,
define <8 x i16> @test_16xi16_to_8xi16_E84C94EF(<16 x i16> %vec) {
; CHECK-LABEL: test_16xi16_to_8xi16_E84C94EF:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [14,8,4,12,9,4,14,15,14,8,4,12,9,4,14,15]
-; CHECK-NEXT: # ymm1 = mem[0,1,0,1]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [14,8,4,12,9,4,14,15]
; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0
; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
@@ -1196,9 +1189,8 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mem_mask3(<8 x i32>* %vp, <4
define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6]
-; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,13,11,14,7,10,1,6]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
@@ -1207,8 +1199,7 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [1,13,11,14,7,10,1,6,1,13,11,14,7,10,1,6]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6]
; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
@@ -1235,8 +1226,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,0,15,3,2,3,6,8,3,0,15,3,2,3,6,8]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8]
; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
@@ -1263,8 +1253,7 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,15,15,2,6,10,14,7,2,15,15,2,6,10,14,7]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7]
; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
@@ -1291,9 +1280,8 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x
define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
; CHECK-LABEL: test_16xi32_to_8xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3]
-; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [14,5,7,7,10,3,9,3]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
@@ -1302,8 +1290,7 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [14,5,7,7,10,3,9,3,14,5,7,7,10,3,9,3]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3]
; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmd %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
@@ -1952,8 +1939,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,4,6,1,6,4,6,1]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [6,4,6,1]
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
@@ -1980,8 +1966,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [6,3,6,3,6,3,6,3]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [6,3,6,3]
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
@@ -1995,8 +1980,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [6,3,6,3]
-; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,3,6,3]
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
@@ -2009,9 +1993,8 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i
define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [6,0,0,7,6,0,0,7]
-; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [6,0,0,7]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
@@ -2020,8 +2003,7 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [6,0,0,7,6,0,0,7]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [6,0,0,7]
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
@@ -2048,8 +2030,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,7,7,5,3,7,7,5]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,7,7,5]
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
@@ -2076,8 +2057,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,1,0,6,4,1,0,6]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,1,0,6]
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
@@ -2104,9 +2084,8 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i
define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [7,6,5,3,7,6,5,3]
-; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,3]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
@@ -2115,8 +2094,7 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [7,6,5,3,7,6,5,3]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [7,6,5,3]
; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
@@ -2896,9 +2874,8 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mem_mask3(<8 x float>*
define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7]
-; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,4,12,10,8,2,11,7]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
@@ -2907,10 +2884,9 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,4,12,10,8,2,11,7,0,4,12,10,8,2,11,7]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7]
+; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -2937,10 +2913,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %v
define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [10,12,3,12,4,15,1,14,10,12,3,12,4,15,1,14]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14]
+; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -2996,9 +2971,8 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask2(<16 x float> %v
define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
; CHECK-LABEL: test_16xfloat_to_8xfloat_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8]
-; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [12,14,9,0,12,4,5,8]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%res = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
@@ -3007,10 +2981,9 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [12,14,9,0,12,4,5,8,12,14,9,0,12,4,5,8]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8]
+; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqps %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -3651,9 +3624,8 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(<4 x doub
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [7,3,7,3,7,3,7,3]
-; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [7,3,7,3]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
@@ -3662,10 +3634,9 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [7,3,7,3,7,3,7,3]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [7,3,7,3]
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -3678,8 +3649,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %v
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [7,3,7,3]
-; CHECK-NEXT: # ymm2 = mem[0,1,0,1]
+; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [7,3,7,3]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -3693,10 +3663,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double>
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,0,7,6,2,0,7,6]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,0,7,6]
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -3749,9 +3718,8 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask2(<8 x double>
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,2,1,4,0,2,1,4]
-; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,1,4]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
@@ -3760,10 +3728,9 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,0,2,1,4]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,2,1,4]
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -3821,10 +3788,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double>
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,6,2,2,2,6,2,2]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [2,6,2,2]
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
@@ -3851,8 +3817,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double>
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [5,8,7,8,5,8,7,8]
-; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [5,8,7,8]
; CHECK-NEXT: vpermt2pd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
@@ -3862,8 +3827,7 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [5,8,7,8,5,8,7,8]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [5,8,7,8]
; CHECK-NEXT: vpermi2pd %zmm0, %zmm0, %zmm3
; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
@@ -3892,10 +3856,9 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double>
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [3,5,0,6,3,5,0,6]
-; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [3,5,0,6]
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll
index 8db4d7713ada..1cfa810f1732 100644
--- a/llvm/test/CodeGen/X86/pr29112.ll
+++ b/llvm/test/CodeGen/X86/pr29112.ll
@@ -11,16 +11,14 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 80
; CHECK-NEXT: vmovaps %xmm1, %xmm9
-; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [4,22,1,17,4,22,1,17,4,22,1,17,4,22,1,17]
-; CHECK-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm14 = [4,22,1,17]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm14
-; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,30,1,22,4,30,1,22,4,30,1,22,4,30,1,22]
-; CHECK-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm10 = [4,30,1,22]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10
-; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm7 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
-; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7
; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8
+; CHECK-NEXT: vmovaps {{.*#+}} xmm7 = <5,20,u,u>
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7
; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4
; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5
diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll
index 3e0638b24f35..39a0449dc4d2 100644
--- a/llvm/test/CodeGen/X86/pr46532.ll
+++ b/llvm/test/CodeGen/X86/pr46532.ll
@@ -7,7 +7,7 @@ define void @WhileWithLoopInvariantOperation.21() {
; CHECK-NEXT: movq (%rax), %rax
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovaps %xmm0, 32(%rax)
-; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [4294967295,4294967295,0,0,0,0,0,0]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,0,0]
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
while.1.body.preheader:
%0 = load i8*, i8** undef, align 8, !invariant.load !0, !dereferenceable !1, !align !2
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index e6821daa97ca..288fbb6a76f9 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -465,9 +465,9 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u>
+; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1
+; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT: retq
@@ -484,9 +484,9 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
;
; AVX512VBMIVL-LABEL: PR34175:
; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
-; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
+; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = <0,8,16,24,u,u,u,u>
+; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1
+; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
; AVX512VBMIVL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll
index 7373bbf68029..9db2f6ba1e81 100644
--- a/llvm/test/CodeGen/X86/trunc-subvector.ll
+++ b/llvm/test/CodeGen/X86/trunc-subvector.ll
@@ -79,7 +79,7 @@ define <2 x i32> @test5(<8 x i32> %v) {
;
; AVX2-LABEL: test5:
; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187]
+; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [3,4,4,4]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
@@ -177,7 +177,7 @@ define <2 x i32> @test10(<8 x i32> %v) {
;
; AVX2-LABEL: test10:
; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [17179869187,17179869187,17179869187,17179869187]
+; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [3,4,4,4]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 19ea28085d75..ac1144818b07 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -354,7 +354,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,8]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -396,7 +396,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,9,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -437,7 +437,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,10,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -478,7 +478,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,11,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -519,7 +519,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -560,7 +560,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -601,7 +601,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 4a4a9d115cb0..a9d9798ebc7a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -812,7 +812,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -859,7 +859,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -906,7 +906,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,0,18,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -953,7 +953,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,0,19,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1000,7 +1000,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_20_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,0,20,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1047,7 +1047,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_21_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,0,21,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1094,7 +1094,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_22_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1141,7 +1141,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_23_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,0,23,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1188,7 +1188,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1235,7 +1235,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1282,7 +1282,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1329,7 +1329,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1376,7 +1376,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1423,7 +1423,7 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1470,7 +1470,7 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 7bbff8767922..f1af4faf67e2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -136,7 +136,7 @@ define <8 x float> @shuffle_v8f32_00040000(<8 x float> %a, <8 x float> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00040000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
@@ -153,7 +153,7 @@ define <8 x float> @shuffle_v8f32_00500000(<8 x float> %a, <8 x float> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8f32_00500000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -170,7 +170,7 @@ define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8f32_06000000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1470,7 +1470,7 @@ define <8 x i32> @shuffle_v8i32_00040000(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00040000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4,0,0,0,0]
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,0,4]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
@@ -1487,7 +1487,7 @@ define <8 x i32> @shuffle_v8i32_00500000(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00500000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0,0,0,0,0]
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,5,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1504,7 +1504,7 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8i32_06000000:
; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,6,0,0,0,0,0,0]
+; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -3280,15 +3280,13 @@ define <8 x i32> @lowhalf_v8i32(<8 x i32> %x, <8 x i32> %y) {
; AVX2-LABEL: lowhalf_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6]
-; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,3,6]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: lowhalf_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,14,3,14,2,14,3,14]
-; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,14,3,14]
; AVX512VL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -3308,15 +3306,13 @@ define <8 x float> @lowhalf_v8f32(<8 x float> %x, <8 x float> %y) {
; AVX2-LABEL: lowhalf_v8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,6,2,6,3,6]
-; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,3,6]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: lowhalf_v8f32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,14,3,14,2,14,3,14]
-; AVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512VL-NEXT: vmovaps {{.*#+}} xmm2 = [2,14,3,14]
; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
; AVX512VL-NEXT: retq
%r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 14, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 60ae9b584053..8e1abdb822a1 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -388,9 +388,8 @@ define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
; ALL: # %bb.0:
-; ALL-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,1,2,3,4,6,7,10,0,1,2,3,4,6,7,10]
-; ALL-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; ALL-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vmovaps {{.*#+}} ymm1 = [0,1,2,3,4,6,7,10]
+; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; ALL-NEXT: retq
%res = shufflevector <16 x float> %v, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 7, i32 10>
@@ -401,9 +400,8 @@ define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
define <4 x float> @test_v16f32_0_1_3_6 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_3_6:
; ALL: # %bb.0:
-; ALL-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [0,1,3,6,0,1,3,6,0,1,3,6,0,1,3,6]
-; ALL-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; ALL-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,1,3,6]
+; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index f582a31a607d..8288ae85368b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -94,13 +94,13 @@ define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00040000:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0]
+; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00040000:
; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,4,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
@@ -110,13 +110,13 @@ define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_00500000:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0]
+; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_00500000:
; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,5,0,0,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -126,13 +126,13 @@ define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_06000000:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0]
+; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [0,6]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_06000000:
; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,6,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -912,13 +912,13 @@ define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_00040000:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,4,0,0,0,0]
+; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,4]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00040000:
; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,0,0,4,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
@@ -928,13 +928,13 @@ define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_00500000:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,5,0,0,0,0,0]
+; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,5,0]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_00500000:
; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,0,5,0,0,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -944,13 +944,13 @@ define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
; AVX512F-LABEL: shuffle_v8i64_06000000:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,6,0,0,0,0,0,0]
+; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [0,6]
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_06000000:
; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [0,0,6,0]
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -2187,17 +2187,15 @@ define <8 x double> @shuffle_v2f64_v8f64_01010101(<2 x double> %a) {
define <4 x double> @test_v8f64_2346 (<8 x double> %v) {
; AVX512F-LABEL: test_v8f64_2346:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [2,3,4,6,2,3,4,6]
-; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [2,3,4,6]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_v8f64_2346:
; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [2,0,3,0,4,0,6,0,2,0,3,0,4,0,6,0]
-; AVX512F-32-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [2,0,3,0,4,0,6,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-32-NEXT: retl
%res = shufflevector <8 x double> %v, <8 x double> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 6>
@@ -2221,17 +2219,15 @@ define <2 x double> @test_v8f64_34 (<8 x double> %v) {
define <4 x i64> @test_v8i64_1257 (<8 x i64> %v) {
; AVX512F-LABEL: test_v8i64_1257:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,2,5,7,1,2,5,7]
-; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,5,7]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: test_v8i64_1257:
; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [1,0,2,0,5,0,7,0,1,0,2,0,5,0,7,0]
-; AVX512F-32-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,2,0,5,0,7,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-32-NEXT: retl
%res = shufflevector <8 x i64> %v, <8 x i64> undef, <4 x i32> <i32 1, i32 2, i32 5, i32 7>