[llvm] r240258 - Reverted AVX-512 vector shuffle
Elena Demikhovsky
elena.demikhovsky at intel.com
Mon Jun 22 02:01:22 PDT 2015
Author: delena
Date: Mon Jun 22 04:01:15 2015
New Revision: 240258
URL: http://llvm.org/viewvc/llvm-project?rev=240258&view=rev
Log:
Reverted AVX-512 vector shuffle
Removed:
llvm/trunk/test/CodeGen/X86/avx512-shuffle.ll
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx512-build-vector.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=240258&r1=240257&r2=240258&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Jun 22 04:01:15 2015
@@ -6259,42 +6259,6 @@ is128BitLaneRepeatedShuffleMask(MVT VT,
return true;
}
-/// \brief Test whether a shuffle mask is equivalent within each 256-bit lane.
-///
-/// This checks a shuffle mask to see if it is performing the same
-/// 256-bit lane-relative shuffle in each 256-bit lane. This trivially implies
-/// that it is also not lane-crossing. It may however involve a blend from the
-/// same lane of a second vector.
-///
-/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
-/// non-trivial to compute in the face of undef lanes. The representation is
-/// *not* suitable for use with existing 256-bit shuffles as it will contain
-/// entries from both V1 and V2 inputs to the wider mask.
-static bool
-is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
- SmallVectorImpl<int> &RepeatedMask) {
- int LaneSize = 256 / VT.getScalarSizeInBits();
- RepeatedMask.resize(LaneSize, -1);
- int Size = Mask.size();
- for (int i = 0; i < Size; ++i) {
- if (Mask[i] < 0)
- continue;
- if ((Mask[i] % Size) / LaneSize != i / LaneSize)
- // This entry crosses lanes, so there is no way to model this shuffle.
- return false;
-
- // Ok, handle the in-lane shuffles by detecting if and when they repeat.
- if (RepeatedMask[i % LaneSize] == -1)
- // This is the first non-undef entry in this slot of a 256-bit lane.
- RepeatedMask[i % LaneSize] =
- Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
- else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
- // Found a mismatch with the repeated mask.
- return false;
- }
- return true;
-}
-
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
@@ -6354,22 +6318,6 @@ static SDValue getV4X86ShuffleImm8ForMas
return DAG.getConstant(Imm, DL, MVT::i8);
}
-/// \brief Get a 8-bit shuffle, 1 bit per lane, immediate for a mask.
-///
-/// This helper function produces an 8-bit shuffle immediate corresponding to
-/// the ubiquitous shuffle encoding scheme used in x86 instructions for
-/// shuffling 8 lanes.
-static SDValue get1bitLaneShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
- SelectionDAG &DAG) {
- assert(Mask.size() <= 8 &&
- "Up to 8 elts may be in Imm8 1-bit lane shuffle mask");
- unsigned Imm = 0;
- for (unsigned i = 0; i < Mask.size(); ++i)
- if (Mask[i] >= 0)
- Imm |= (Mask[i] % 2) << i;
- return DAG.getConstant(Imm, DL, MVT::i8);
-}
-
/// \brief Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
@@ -9385,30 +9333,6 @@ static SDValue lowerV2X128VectorShuffle(
DAG.getConstant(PermMask, DL, MVT::i8));
}
-/// \brief Handle lowering 4-lane 128-bit shuffles.
-static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> WidenedMask,
- SelectionDAG &DAG) {
-
- assert(WidenedMask.size() == 4 && "Unexpected mask size for 128bit shuffle!");
- // form a 128-bit permutation.
- // convert the 64-bit shuffle mask selection values into 128-bit selection
- // bits defined by a vshuf64x2 instruction's immediate control byte.
- unsigned PermMask = 0, Imm = 0;
-
- for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
- if(WidenedMask[i] == SM_SentinelZero)
- return SDValue();
-
- // use first element in place of undef musk
- Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
- PermMask |= (Imm % 4) << (i * 2);
- }
-
- return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
- DAG.getConstant(PermMask, DL, MVT::i8));
-}
-
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
@@ -10144,105 +10068,86 @@ static SDValue lower256BitVectorShuffle(
}
}
-static SDValue lowerVectorShuffleWithVALIGN(SDLoc DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
+/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
+static SDValue lowerV8F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- assert(VT.getScalarSizeInBits() >= 32 && "Unexpected data type for VALIGN");
- // VALIGN pattern 2, 3, 4, 5, .. (sequential, shifted right)
- int AlignVal = -1;
- for (int i = 0; i < (signed)VT.getVectorNumElements(); ++i) {
- if (Mask[i] < 0)
- continue;
- if (Mask[i] < i)
- return SDValue();
- if (AlignVal == -1)
- AlignVal = Mask[i] - i;
- else if (Mask[i] - i != AlignVal)
- return SDValue();
- }
- // Vector source operands should be swapped
- return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1,
- DAG.getConstant(AlignVal, DL, MVT::i8));
-}
-
-static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
- ArrayRef<int> Mask, SDValue V1,
- SDValue V2, SelectionDAG &DAG) {
-
- assert(VT.getScalarSizeInBits() >= 16 && "Unexpected data type for PERMV");
-
- MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
- MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
-
- SmallVector<SDValue, 32> VPermMask;
- for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
- VPermMask.push_back(Mask[i] < 0 ? DAG.getUNDEF(MaskEltVT) :
- DAG.getConstant(Mask[i], DL,MaskEltVT));
- SDValue MaskNode = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecVT,
- VPermMask);
- if (isSingleInputShuffleMask(Mask))
- return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
+ // X86 has dedicated unpack instructions that can handle specific blend
+ // operations: UNPCKH and UNPCKL.
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8f64, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8f64, V1, V2);
- return DAG.getNode(X86ISD::VPERMV3, DL, VT, MaskNode, V1, V2);
+ // FIXME: Implement direct support for this type!
+ return splitAndLowerVectorShuffle(DL, MVT::v8f64, V1, V2, Mask, DAG);
}
+/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
+static SDValue lowerV16F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ ArrayRef<int> Mask = SVOp->getMask();
+ assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
-/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
-static SDValue lowerV8X64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+ // Use dedicated unpack instructions for masks that match their pattern.
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane.
+ 0, 16, 1, 17, 4, 20, 5, 21,
+ // Second 128-bit lane.
+ 8, 24, 9, 25, 12, 28, 13, 29}))
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask,
+ {// First 128-bit lane.
+ 2, 18, 3, 19, 6, 22, 7, 23,
+ // Second 128-bit lane.
+ 10, 26, 11, 27, 14, 30, 15, 31}))
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16f32, V1, V2);
+
+ // FIXME: Implement direct support for this type!
+ return splitAndLowerVectorShuffle(DL, MVT::v16f32, V1, V2, Mask, DAG);
+}
+
+/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
+static SDValue lowerV8I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
- MVT VT = Op.getSimpleValueType();
- assert((V1.getSimpleValueType() == MVT::v8f64 ||
- V1.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
- assert((V2.getSimpleValueType() == MVT::v8f64 ||
- V2.getSimpleValueType() == MVT::v8i64) && "Bad operand type!");
+ assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- SmallVector<int, 4> WidenedMask;
- if (canWidenShuffleElements(Mask, WidenedMask))
- if(SDValue Op = lowerV4X128VectorShuffle(DL, VT, V1, V2, WidenedMask, DAG))
- return Op;
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
if (isShuffleEquivalent(V1, V2, Mask, {0, 8, 2, 10, 4, 12, 6, 14}))
- return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i64, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask, {1, 9, 3, 11, 5, 13, 7, 15}))
- return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
-
- if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
- return Op;
-
- if (SDValue Op = lowerVectorShuffleWithSHUFPD(DL, VT, Mask, V1, V2, DAG))
- return Op;
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i64, V1, V2);
- // PERMILPD instruction - mask 0/1, 0/1, 2/3, 2/3, 4/5, 4/5, 6/7, 6/7
- if (isSingleInputShuffleMask(Mask)) {
- if (!is128BitLaneCrossingShuffleMask(VT, Mask))
- return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
- get1bitLaneShuffleImm8ForMask(Mask, DL, DAG));
-
- SmallVector<int, 4> RepeatedMask;
- if (is256BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
- return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
- getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
- }
- return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
+ // FIXME: Implement direct support for this type!
+ return splitAndLowerVectorShuffle(DL, MVT::v8i64, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
-static SDValue lowerV16X32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
+static SDValue lowerV16I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
- assert((V1.getSimpleValueType() == MVT::v16i32 ||
- V1.getSimpleValueType() == MVT::v16f32) && "Bad operand type!");
- assert((V2.getSimpleValueType() == MVT::v16i32 ||
- V2.getSimpleValueType() == MVT::v16f32) && "Bad operand type!");
+ assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
+ assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
@@ -10253,39 +10158,16 @@ static SDValue lowerV16X32VectorShuffle(
0, 16, 1, 17, 4, 20, 5, 21,
// Second 128-bit lane.
8, 24, 9, 25, 12, 28, 13, 29}))
- return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
+ return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i32, V1, V2);
if (isShuffleEquivalent(V1, V2, Mask,
{// First 128-bit lane.
2, 18, 3, 19, 6, 22, 7, 23,
// Second 128-bit lane.
10, 26, 11, 27, 14, 30, 15, 31}))
- return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
-
- if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
- 12, 12, 14, 14}))
- return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
- if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11,
- 13, 13, 15, 15}))
- return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);
-
- SmallVector<int, 4> RepeatedMask;
- if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) {
- if (isSingleInputShuffleMask(Mask)) {
- unsigned Opc = VT.isInteger() ? X86ISD::PSHUFD : X86ISD::VPERMILPI;
- return DAG.getNode(Opc, DL, VT, V1,
- getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
- }
-
- for (int i = 0; i < 4; ++i)
- if (RepeatedMask[i] >= 16)
- RepeatedMask[i] -= 12;
- return lowerVectorShuffleWithSHUFPS(DL, VT, RepeatedMask, V1, V2, DAG);
- }
-
- if (SDValue Op = lowerVectorShuffleWithVALIGN(DL, VT, Mask, V1, V2, DAG))
- return Op;
+ return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i32, V1, V2);
- return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
+ // FIXME: Implement direct support for this type!
+ return splitAndLowerVectorShuffle(DL, MVT::v16i32, V1, V2, Mask, DAG);
}
/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
@@ -10345,11 +10227,13 @@ static SDValue lower512BitVectorShuffle(
// the requisite ISA extensions for that element type are available.
switch (VT.SimpleTy) {
case MVT::v8f64:
- case MVT::v8i64:
- return lowerV8X64VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV8F64VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v16f32:
+ return lowerV16F32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ case MVT::v8i64:
+ return lowerV8I64VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v16i32:
- return lowerV16X32VectorShuffle(Op, V1, V2, Subtarget, DAG);
+ return lowerV16I32VectorShuffle(Op, V1, V2, Subtarget, DAG);
case MVT::v32i16:
if (Subtarget->hasBWI())
return lowerV32I16VectorShuffle(Op, V1, V2, Subtarget, DAG);
Modified: llvm/trunk/test/CodeGen/X86/avx512-build-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-build-vector.ll?rev=240258&r1=240257&r2=240258&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-build-vector.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-build-vector.ll Mon Jun 22 04:01:15 2015
@@ -1,15 +1,5 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-define <16 x i32> @test1(i32* %x) {
-; CHECK-LABEL: test1:
-; CHECK: vmovd (%rdi), %xmm
-; CHECK: vmovdqa32
-; CHECK: vpermt2d %zmm
- %y = load i32, i32* %x, align 4
- %res = insertelement <16 x i32>zeroinitializer, i32 %y, i32 4
- ret <16 x i32>%res
-}
-
define <16 x i32> @test2(<16 x i32> %x) {
; CHECK-LABEL: test2:
; CHECK: ## BB#0:
Removed: llvm/trunk/test/CodeGen/X86/avx512-shuffle.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffle.ll?rev=240257&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle.ll (removed)
@@ -1,392 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK-SKX
-
-; CHECK-LABEL: test1:
-; CHECK: vpermps
-; CHECK: ret
-define <16 x float> @test1(<16 x float> %a) nounwind {
- %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
- ret <16 x float> %c
-}
-
-; CHECK-LABEL: test2:
-; CHECK: vpermd
-; CHECK: ret
-define <16 x i32> @test2(<16 x i32> %a) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 2, i32 5, i32 undef, i32 undef, i32 7, i32 undef, i32 10, i32 1, i32 0, i32 5, i32 undef, i32 4, i32 7, i32 undef, i32 10, i32 1>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: test3:
-; CHECK: vpermq
-; CHECK: ret
-define <8 x i64> @test3(<8 x i64> %a) nounwind {
- %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 5, i32 1, i32 undef, i32 7, i32 undef, i32 3, i32 1>
- ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test4:
-; CHECK: vpermpd
-; CHECK: ret
-define <8 x double> @test4(<8 x double> %a) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <8 x double> %c
-}
-
-; CHECK-LABEL: test5:
-; CHECK: vpermt2pd
-; CHECK: ret
-define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
- ret <8 x double> %c
-}
-
-; CHECK-LABEL: test6:
-; CHECK: vpermq $30
-; CHECK: ret
-define <8 x i64> @test6(<8 x i64> %a) nounwind {
- %c = shufflevector <8 x i64> %a, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
- ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test7:
-; CHECK: vpermt2q
-; CHECK: ret
-define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
- %c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
- ret <8 x i64> %c
-}
-
-; CHECK-LABEL: test8:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: test9:
-; CHECK: vpermt2ps
-; CHECK: ret
-define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
- %c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x float> %c
-}
-
-; CHECK-LABEL: test10:
-; CHECK: vpermt2ps (
-; CHECK: ret
-define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
- %c = load <16 x float>, <16 x float>* %b
- %d = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x float> %d
-}
-
-; CHECK-LABEL: test11:
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
- %c = load <16 x i32>, <16 x i32>* %b
- %d = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
- ret <16 x i32> %d
-}
-
-; CHECK-LABEL: test13
-; CHECK: vpermilps $177, %zmm
-; CHECK: ret
-define <16 x float> @test13(<16 x float> %a) {
- %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: test14
-; CHECK: vpermilpd $203, %zmm
-; CHECK: ret
-define <8 x double> @test14(<8 x double> %a) {
- %b = shufflevector <8 x double> %a, <8 x double> undef, <8 x i32><i32 1, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 7>
- ret <8 x double> %b
-}
-
-; CHECK-LABEL: test15
-; CHECK: vpshufd $177, %zmm
-; CHECK: ret
-define <16 x i32> @test15(<16 x i32> %a) {
-; mask 1-0-3-2 = 10110001 = 0xb1 = 177
- %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
- ret <16 x i32> %b
-}
-; CHECK-LABEL: test16
-; CHECK: valignq $3, %zmm0, %zmm1
-; CHECK: ret
-define <8 x double> @test16(<8 x double> %a, <8 x double> %b) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
- ret <8 x double> %c
-}
-
-; CHECK-LABEL: test17
-; CHECK: vshufpd $19, %zmm1, %zmm0
-; CHECK: ret
-define <8 x double> @test17(<8 x double> %a, <8 x double> %b) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 9, i32 2, i32 10, i32 5, i32 undef, i32 undef, i32 undef>
- ret <8 x double> %c
-}
-
-; CHECK-LABEL: test18
-; CHECK: vpunpckhdq %zmm
-; CHECK: ret
-define <16 x i32> @test18(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
- ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test19
-; CHECK: vpunpckldq %zmm
-; CHECK: ret
-define <16 x i32> @test19(<16 x i32> %a, <16 x i32> %c) {
- %b = shufflevector <16 x i32> %a, <16 x i32> %c, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
- ret <16 x i32> %b
-}
-
-; CHECK-LABEL: test20
-; CHECK: vpunpckhqdq %zmm
-; CHECK: ret
-define <8 x i64> @test20(<8 x i64> %a, <8 x i64> %c) {
- %b = shufflevector <8 x i64> %a, <8 x i64> %c, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
- ret <8 x i64> %b
-}
-
-; CHECK-LABEL: test21
-; CHECK: vbroadcastsd %xmm0, %zmm
-; CHECK: ret
-define <8 x double> @test21(<8 x double> %a, <8 x double> %b) {
- %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- ret <8 x double> %shuffle
-}
-
-; CHECK-LABEL: test22
-; CHECK: vpbroadcastq %xmm0, %zmm
-; CHECK: ret
-define <8 x i64> @test22(<8 x i64> %a, <8 x i64> %b) {
- %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- ret <8 x i64> %shuffle
-}
-
-; CHECK-LABEL: @test23
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: ret
-define <16 x i32> @test23(<16 x i32> %a, <16 x i32> %b) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test24
-; CHECK: vpermt2d
-; CHECK: ret
-define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 25, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test25
-; CHECK: vshufps $52
-; CHECK: ret
-define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
-; mask - 0-1-3-0 00110100 = 0x34 = 52
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 19, i32 16, i32 4, i32 5, i32 23, i32 undef, i32 8, i32 9, i32 27, i32 undef, i32 12, i32 13, i32 undef, i32 undef>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test26
-; CHECK: vmovshdup
-; CHECK: ret
-define <16 x i32> @test26(<16 x i32> %a) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: @test27
-; CHECK: ret
-define <16 x i32> @test27(<4 x i32>%a) {
- %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
- ret <16 x i32> %res
-}
-
-; CHECK-LABEL: test28
-; CHECK: vpshufhw $177, %ymm
-; CHECK: ret
-define <16 x i16> @test28(<16 x i16> %a) {
- %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32><i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 12, i32 15, i32 14>
- ret <16 x i16> %b
-}
-
-; CHECK-LABEL: test29
-; CHECK: vunpcklps %zmm
-; CHECK: ret
-define <16 x float> @test29(<16 x float> %a, <16 x float> %c) {
- %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: @test30
-; CHECK: vshufps $144, %zmm
-; CHECK: ret
-define <16 x float> @test30(<16 x float> %a, <16 x float> %c) {
- %b = shufflevector <16 x float> %a, <16 x float> %c, <16 x i32><i32 0, i32 0, i32 17, i32 18, i32 4, i32 4, i32 21, i32 22, i32 8, i32 8, i32 25, i32 26, i32 12, i32 12, i32 29, i32 30>
- ret <16 x float> %b
-}
-
-; CHECK-LABEL: test31
-; CHECK: valignd $3, %zmm0, %zmm1
-; CHECK: ret
-define <16 x i32> @test31(<16 x i32> %a, <16 x i32> %b) nounwind {
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
- ret <16 x i32> %c
-}
-
-; CHECK-LABEL: test32
-; CHECK: vshufpd $99, %zmm0, %zmm1
-; CHECK: ret
-define <8 x double> @test32(<8 x double> %a, <8 x double> %b) nounwind {
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 1, i32 10, i32 2, i32 undef, i32 5, i32 15, i32 undef>
- ret <8 x double> %c
-}
-
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind {
-; CHECK-LABEL: test_vshuff64x2_512:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5>
- ret <8 x double> %res
-}
-
-define <8 x double> @test_vshuff64x2_512_mask(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
-; CHECK-LABEL: test_vshuff64x2_512_mask:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1
-; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vshuff64x2 $136, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
- %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 0, i32 1, i32 4, i32 5>
- %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
- ret <8 x double> %res
-}
-
-define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
-; CHECK-LABEL: test_vshufi64x2_512_mask:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxwq %xmm2, %zmm1
-; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: vshufi64x2 $168, %zmm0, %zmm0, %zmm0 {%k1}
-; CHECK-NEXT: retq
- %y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 4, i32 5>
- %res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
- ret <8 x i64> %res
-}
-
-define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind {
-; CHECK-LABEL: test_vshuff64x2_512_mem:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vshuff64x2 $40, %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %x1 = load <8 x double>,<8 x double> *%ptr,align 1
- %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 0, i32 1>
- ret <8 x double> %res
-}
-
-define <16 x float> @test_vshuff32x4_512_mem(<16 x float> %x, <16 x float> *%ptr) nounwind {
-; CHECK-LABEL: test_vshuff32x4_512_mem:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vshuff64x2 $20, %zmm0, %zmm0, %zmm0
-; CHECK-NEXT: retq
- %x1 = load <16 x float>,<16 x float> *%ptr,align 1
- %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
- ret <16 x float> %res
-}
-
-define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind {
-; CHECK-LABEL: test_align_v16i32_rr:
-; CHECK: ## BB#0:
-; CHECK-NEXT: valignd $3, %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
- ret <16 x i32> %c
-}
-
-define <16 x i32> @test_align_v16i32_rm(<16 x i32>* %a.ptr, <16 x i32> %b) nounwind {
-; CHECK-LABEL: test_align_v16i32_rm:
-; CHECK: ## BB#0:
-; CHECK-NEXT: valignd $3, (%rdi), %zmm0, %zmm0
-; CHECK-NEXT: retq
- %a = load <16 x i32>, <16 x i32>* %a.ptr
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
- ret <16 x i32> %c
-}
-
-define <16 x i32> @test_align_v16i32_rm_mask(<16 x i32>* %a.ptr, <16 x i32> %b, <16 x i1> %mask) nounwind {
-; CHECK-LABEL: test_align_v16i32_rm_mask:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1
-; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
-; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1
-; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
-; CHECK-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-NEXT: retq
-;
-; CHECK-SKX-LABEL: test_align_v16i32_rm_mask:
-; CHECK-SKX: ## BB#0:
-; CHECK-SKX-NEXT: vpmovb2m %xmm1, %k1
-; CHECK-SKX-NEXT: vmovdqa32 (%rdi), %zmm1
-; CHECK-SKX-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
-; CHECK-SKX-NEXT: vmovaps %zmm1, %zmm0
-; CHECK-SKX-NEXT: retq
- %a = load <16 x i32>, <16 x i32>* %a.ptr
- %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
- %res = select <16 x i1> %mask,<16 x i32> %c, <16 x i32> %a
- ret <16 x i32> %res
-}
-
-define <8 x double> @test_align_v8f64_rr(<8 x double> %a, <8 x double> %b) nounwind {
-; CHECK-LABEL: test_align_v8f64_rr:
-; CHECK: ## BB#0:
-; CHECK-NEXT: valignq $3, %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: retq
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
- ret <8 x double> %c
-}
-
-define <8 x double> @test_align_v18f64_rm(<8 x double>* %a.ptr, <8 x double> %b) nounwind {
-; CHECK-LABEL: test_align_v18f64_rm:
-; CHECK: ## BB#0:
-; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0
-; CHECK-NEXT: retq
- %a = load <8 x double>, <8 x double>* %a.ptr
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
- ret <8 x double> %c
-}
-
-define <8 x double> @test_align_v18f64_rm_mask(<8 x double>* %a.ptr, <8 x double> %b, <8 x i1> %mask) nounwind {
-; CHECK-LABEL: test_align_v18f64_rm_mask:
-; CHECK: ## BB#0:
-; CHECK-NEXT: vpmovsxwq %xmm1, %zmm1
-; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
-; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: retq
-;
-; CHECK-SKX-LABEL: test_align_v18f64_rm_mask:
-; CHECK-SKX: ## BB#0:
-; CHECK-SKX-NEXT: vpmovw2m %xmm1, %k1
-; CHECK-SKX-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-SKX-NEXT: retq
- %a = load <8 x double>, <8 x double>* %a.ptr
- %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
- %res = select <8 x i1> %mask,<8 x double> %c, <8 x double> zeroinitializer
- ret <8 x double> %res
-}
-
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll?rev=240258&r1=240257&r2=240258&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll Mon Jun 22 04:01:15 2015
@@ -15,8 +15,9 @@ define <8 x double> @shuffle_v8f64_00000
define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00000010:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <8 x double> %shuffle
@@ -25,8 +26,9 @@ define <8 x double> @shuffle_v8f64_00000
define <8 x double> @shuffle_v8f64_00000200(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00000200:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -35,8 +37,9 @@ define <8 x double> @shuffle_v8f64_00000
define <8 x double> @shuffle_v8f64_00003000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00003000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -45,8 +48,11 @@ define <8 x double> @shuffle_v8f64_00003
define <8 x double> @shuffle_v8f64_00040000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00040000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -55,8 +61,11 @@ define <8 x double> @shuffle_v8f64_00040
define <8 x double> @shuffle_v8f64_00500000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00500000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -65,8 +74,11 @@ define <8 x double> @shuffle_v8f64_00500
define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_06000000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -75,11 +87,11 @@ define <8 x double> @shuffle_v8f64_06000
define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_70000000:
; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: movl $7, %eax
-; ALL-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2
-; ALL-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,0,0,0]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x double> %shuffle
@@ -88,7 +100,10 @@ define <8 x double> @shuffle_v8f64_70000
define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_01014545:
; ALL: # BB#0:
-; ALL-NEXT: vshuff64x2 $160, %zmm0, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x double> %shuffle
@@ -97,8 +112,9 @@ define <8 x double> @shuffle_v8f64_01014
define <8 x double> @shuffle_v8f64_00112233(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00112233:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x double> %shuffle
@@ -107,8 +123,9 @@ define <8 x double> @shuffle_v8f64_00112
define <8 x double> @shuffle_v8f64_00001111(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00001111:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x double> %shuffle
@@ -117,7 +134,11 @@ define <8 x double> @shuffle_v8f64_00001
define <8 x double> @shuffle_v8f64_81a3c5e7(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_81a3c5e7:
; ALL: # BB#0:
-; ALL-NEXT: vshufpd $170, %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x double> %shuffle
@@ -126,9 +147,10 @@ define <8 x double> @shuffle_v8f64_81a3c
define <8 x double> @shuffle_v8f64_08080808(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08080808:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x double> %shuffle
@@ -137,9 +159,15 @@ define <8 x double> @shuffle_v8f64_08080
define <8 x double> @shuffle_v8f64_08084c4c(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08084c4c:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3]
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x double> %shuffle
@@ -148,9 +176,13 @@ define <8 x double> @shuffle_v8f64_08084
define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_8823cc67:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm3, %ymm3
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -159,9 +191,13 @@ define <8 x double> @shuffle_v8f64_8823c
define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_9832dc76:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x double> %shuffle
@@ -170,9 +206,13 @@ define <8 x double> @shuffle_v8f64_9832d
define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_9810dc54:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x double> %shuffle
@@ -181,9 +221,15 @@ define <8 x double> @shuffle_v8f64_9810d
define <8 x double> @shuffle_v8f64_08194c5d(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08194c5d:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x double> %shuffle
@@ -192,9 +238,15 @@ define <8 x double> @shuffle_v8f64_08194
define <8 x double> @shuffle_v8f64_2a3b6e7f(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_2a3b6e7f:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x double> %shuffle
@@ -203,9 +255,13 @@ define <8 x double> @shuffle_v8f64_2a3b6
define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08192a3b:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x double> %shuffle
@@ -214,9 +270,11 @@ define <8 x double> @shuffle_v8f64_08192
define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08991abb:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,1,1]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,2,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x double> %shuffle
@@ -225,9 +283,12 @@ define <8 x double> @shuffle_v8f64_08991
define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_091b2d3f:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
ret <8 x double> %shuffle
@@ -236,9 +297,11 @@ define <8 x double> @shuffle_v8f64_091b2
define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_09ab1def:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x double> %shuffle
@@ -247,7 +310,10 @@ define <8 x double> @shuffle_v8f64_09ab1
define <8 x double> @shuffle_v8f64_00014445(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00014445:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $64, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x double> %shuffle
@@ -256,7 +322,10 @@ define <8 x double> @shuffle_v8f64_00014
define <8 x double> @shuffle_v8f64_00204464(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00204464:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $32, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x double> %shuffle
@@ -265,7 +334,10 @@ define <8 x double> @shuffle_v8f64_00204
define <8 x double> @shuffle_v8f64_03004744(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_03004744:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $12, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -274,7 +346,10 @@ define <8 x double> @shuffle_v8f64_03004
define <8 x double> @shuffle_v8f64_10005444(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10005444:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $1, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -283,7 +358,10 @@ define <8 x double> @shuffle_v8f64_10005
define <8 x double> @shuffle_v8f64_22006644(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_22006644:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $10, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -292,7 +370,10 @@ define <8 x double> @shuffle_v8f64_22006
define <8 x double> @shuffle_v8f64_33307774(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_33307774:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $63, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x double> %shuffle
@@ -301,7 +382,10 @@ define <8 x double> @shuffle_v8f64_33307
define <8 x double> @shuffle_v8f64_32107654(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_32107654:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd $27, %zmm0, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x double> %shuffle
@@ -310,7 +394,10 @@ define <8 x double> @shuffle_v8f64_32107
define <8 x double> @shuffle_v8f64_00234467(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00234467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $136, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -319,7 +406,10 @@ define <8 x double> @shuffle_v8f64_00234
define <8 x double> @shuffle_v8f64_00224466(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00224466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $0, %zmm0, %zmm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -328,7 +418,10 @@ define <8 x double> @shuffle_v8f64_00224
define <8 x double> @shuffle_v8f64_10325476(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10325476:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $85, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x double> %shuffle
@@ -337,7 +430,10 @@ define <8 x double> @shuffle_v8f64_10325
define <8 x double> @shuffle_v8f64_11335577(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_11335577:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $255, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x double> %shuffle
@@ -346,7 +442,10 @@ define <8 x double> @shuffle_v8f64_11335
define <8 x double> @shuffle_v8f64_10235467(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $153, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -355,7 +454,10 @@ define <8 x double> @shuffle_v8f64_10235
define <8 x double> @shuffle_v8f64_10225466(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10225466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $17, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -364,8 +466,10 @@ define <8 x double> @shuffle_v8f64_10225
define <8 x double> @shuffle_v8f64_00015444(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00015444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -374,8 +478,10 @@ define <8 x double> @shuffle_v8f64_00015
define <8 x double> @shuffle_v8f64_00204644(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00204644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -384,8 +490,10 @@ define <8 x double> @shuffle_v8f64_00204
define <8 x double> @shuffle_v8f64_03004474(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_03004474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
ret <8 x double> %shuffle
@@ -394,8 +502,10 @@ define <8 x double> @shuffle_v8f64_03004
define <8 x double> @shuffle_v8f64_10004444(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10004444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -404,8 +514,10 @@ define <8 x double> @shuffle_v8f64_10004
define <8 x double> @shuffle_v8f64_22006446(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_22006446:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
ret <8 x double> %shuffle
@@ -414,8 +526,10 @@ define <8 x double> @shuffle_v8f64_22006
define <8 x double> @shuffle_v8f64_33307474(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_33307474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
ret <8 x double> %shuffle
@@ -424,8 +538,9 @@ define <8 x double> @shuffle_v8f64_33307
define <8 x double> @shuffle_v8f64_32104567(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_32104567:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -434,8 +549,10 @@ define <8 x double> @shuffle_v8f64_32104
define <8 x double> @shuffle_v8f64_00236744(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00236744:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -444,8 +561,10 @@ define <8 x double> @shuffle_v8f64_00236
define <8 x double> @shuffle_v8f64_00226644(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00226644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -454,7 +573,9 @@ define <8 x double> @shuffle_v8f64_00226
define <8 x double> @shuffle_v8f64_10324567(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_10324567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $165, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -463,7 +584,9 @@ define <8 x double> @shuffle_v8f64_10324
define <8 x double> @shuffle_v8f64_11334567(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_11334567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $175, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -472,7 +595,9 @@ define <8 x double> @shuffle_v8f64_11334
define <8 x double> @shuffle_v8f64_01235467(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_01235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $154, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -481,7 +606,9 @@ define <8 x double> @shuffle_v8f64_01235
define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_01235466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $26, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -490,8 +617,10 @@ define <8 x double> @shuffle_v8f64_01235
define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_002u6u44:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm0[0,0,2,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
ret <8 x double> %shuffle
@@ -500,8 +629,10 @@ define <8 x double> @shuffle_v8f64_002u6
define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_00uu66uu:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm1
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -510,7 +641,9 @@ define <8 x double> @shuffle_v8f64_00uu6
define <8 x double> @shuffle_v8f64_103245uu(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_103245uu:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $37, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,0,3,2]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -519,7 +652,9 @@ define <8 x double> @shuffle_v8f64_10324
define <8 x double> @shuffle_v8f64_1133uu67(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_1133uu67:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $143, %zmm0, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm0[1,1,3,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x double> %shuffle
@@ -528,7 +663,9 @@ define <8 x double> @shuffle_v8f64_1133u
define <8 x double> @shuffle_v8f64_0uu354uu(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_0uu354uu:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $24, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x double> %shuffle
@@ -537,7 +674,9 @@ define <8 x double> @shuffle_v8f64_0uu35
define <8 x double> @shuffle_v8f64_uuu3uu66(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_uuu3uu66:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $8, %zmm0, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x double> %shuffle
@@ -546,9 +685,16 @@ define <8 x double> @shuffle_v8f64_uuu3u
define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_c348cda0:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vbroadcastsd %xmm1, %ymm4
+; ALL-NEXT: vblendpd {{.*#+}} ymm4 = ymm3[0,1,2],ymm4[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3]
+; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 12, i32 3, i32 4, i32 8, i32 12, i32 13, i32 10, i32 0>
ret <8 x double> %shuffle
@@ -557,9 +703,17 @@ define <8 x double> @shuffle_v8f64_c348c
define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_f511235a:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2pd %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm0[0],ymm2[1],ymm0[2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,3,1,3]
+; ALL-NEXT: vmovddup {{.*#+}} ymm4 = ymm1[0,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3]
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>
ret <8 x double> %shuffle
@@ -577,8 +731,9 @@ define <8 x i64> @shuffle_v8i64_00000000
define <8 x i64> @shuffle_v8i64_00000010(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00000010:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 0>
ret <8 x i64> %shuffle
@@ -587,8 +742,9 @@ define <8 x i64> @shuffle_v8i64_00000010
define <8 x i64> @shuffle_v8i64_00000200(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00000200:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 2, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -597,8 +753,9 @@ define <8 x i64> @shuffle_v8i64_00000200
define <8 x i64> @shuffle_v8i64_00003000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00003000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 3, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -607,8 +764,11 @@ define <8 x i64> @shuffle_v8i64_00003000
define <8 x i64> @shuffle_v8i64_00040000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00040000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 4, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -617,8 +777,11 @@ define <8 x i64> @shuffle_v8i64_00040000
define <8 x i64> @shuffle_v8i64_00500000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00500000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 5, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -627,8 +790,11 @@ define <8 x i64> @shuffle_v8i64_00500000
define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_06000000:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -637,11 +803,11 @@ define <8 x i64> @shuffle_v8i64_06000000
define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_70000000:
; ALL: # BB#0:
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: movl $7, %eax
-; ALL-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2
-; ALL-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,0,0,0]
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
ret <8 x i64> %shuffle
@@ -650,7 +816,10 @@ define <8 x i64> @shuffle_v8i64_70000000
define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_01014545:
; ALL: # BB#0:
-; ALL-NEXT: vshufi64x2 $160, %zmm0, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x i64> %shuffle
@@ -659,8 +828,9 @@ define <8 x i64> @shuffle_v8i64_01014545
define <8 x i64> @shuffle_v8i64_00112233(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00112233:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x i64> %shuffle
@@ -669,8 +839,9 @@ define <8 x i64> @shuffle_v8i64_00112233
define <8 x i64> @shuffle_v8i64_00001111(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00001111:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
ret <8 x i64> %shuffle
@@ -679,7 +850,11 @@ define <8 x i64> @shuffle_v8i64_00001111
define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_81a3c5e7:
; ALL: # BB#0:
-; ALL-NEXT: vshufpd $170, %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
ret <8 x i64> %shuffle
@@ -688,9 +863,10 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7
define <8 x i64> @shuffle_v8i64_08080808(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08080808:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 0, i32 8, i32 0, i32 8>
ret <8 x i64> %shuffle
@@ -699,9 +875,15 @@ define <8 x i64> @shuffle_v8i64_08080808
define <8 x i64> @shuffle_v8i64_08084c4c(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08084c4c:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 0, i32 8, i32 4, i32 12, i32 4, i32 12>
ret <8 x i64> %shuffle
@@ -710,9 +892,13 @@ define <8 x i64> @shuffle_v8i64_08084c4c
define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_8823cc67:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; ALL-NEXT: vpbroadcastq %xmm3, %ymm3
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; ALL-NEXT: vpbroadcastq %xmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -721,9 +907,13 @@ define <8 x i64> @shuffle_v8i64_8823cc67
define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_9832dc76:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x i64> %shuffle
@@ -732,9 +922,13 @@ define <8 x i64> @shuffle_v8i64_9832dc76
define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_9810dc54:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm2
+; ALL-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x i64> %shuffle
@@ -743,9 +937,15 @@ define <8 x i64> @shuffle_v8i64_9810dc54
define <8 x i64> @shuffle_v8i64_08194c5d(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08194c5d:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
ret <8 x i64> %shuffle
@@ -754,9 +954,15 @@ define <8 x i64> @shuffle_v8i64_08194c5d
define <8 x i64> @shuffle_v8i64_2a3b6e7f(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_2a3b6e7f:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
ret <8 x i64> %shuffle
@@ -765,9 +971,13 @@ define <8 x i64> @shuffle_v8i64_2a3b6e7f
define <8 x i64> @shuffle_v8i64_08192a3b(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08192a3b:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,2,3]
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
ret <8 x i64> %shuffle
@@ -776,9 +986,11 @@ define <8 x i64> @shuffle_v8i64_08192a3b
define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_08991abb:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,0,1,1]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,2,3,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x i64> %shuffle
@@ -787,9 +999,12 @@ define <8 x i64> @shuffle_v8i64_08991abb
define <8 x i64> @shuffle_v8i64_091b2d3f(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_091b2d3f:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,1,3,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
ret <8 x i64> %shuffle
@@ -798,9 +1013,11 @@ define <8 x i64> @shuffle_v8i64_091b2d3f
define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_09ab1def:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x i64> %shuffle
@@ -809,7 +1026,10 @@ define <8 x i64> @shuffle_v8i64_09ab1def
define <8 x i64> @shuffle_v8i64_00014445(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00014445:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $64, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 4, i32 4, i32 4, i32 5>
ret <8 x i64> %shuffle
@@ -818,7 +1038,10 @@ define <8 x i64> @shuffle_v8i64_00014445
define <8 x i64> @shuffle_v8i64_00204464(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00204464:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $32, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 4, i32 6, i32 4>
ret <8 x i64> %shuffle
@@ -827,7 +1050,10 @@ define <8 x i64> @shuffle_v8i64_00204464
define <8 x i64> @shuffle_v8i64_03004744(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_03004744:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $12, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 7, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -836,7 +1062,10 @@ define <8 x i64> @shuffle_v8i64_03004744
define <8 x i64> @shuffle_v8i64_10005444(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10005444:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $1, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -845,7 +1074,10 @@ define <8 x i64> @shuffle_v8i64_10005444
define <8 x i64> @shuffle_v8i64_22006644(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_22006644:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $10, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -854,7 +1086,10 @@ define <8 x i64> @shuffle_v8i64_22006644
define <8 x i64> @shuffle_v8i64_33307774(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_33307774:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $63, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 7, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -863,7 +1098,10 @@ define <8 x i64> @shuffle_v8i64_33307774
define <8 x i64> @shuffle_v8i64_32107654(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_32107654:
; ALL: # BB#0:
-; ALL-NEXT: vpermq $27, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x i64> %shuffle
@@ -872,7 +1110,10 @@ define <8 x i64> @shuffle_v8i64_32107654
define <8 x i64> @shuffle_v8i64_00234467(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00234467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $136, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 4, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -881,7 +1122,10 @@ define <8 x i64> @shuffle_v8i64_00234467
define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00224466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $0, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -890,7 +1134,10 @@ define <8 x i64> @shuffle_v8i64_00224466
define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10325476:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $85, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x i64> %shuffle
@@ -899,7 +1146,10 @@ define <8 x i64> @shuffle_v8i64_10325476
define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_11335577:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $255, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
ret <8 x i64> %shuffle
@@ -908,7 +1158,10 @@ define <8 x i64> @shuffle_v8i64_11335577
define <8 x i64> @shuffle_v8i64_10235467(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $153, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -917,7 +1170,10 @@ define <8 x i64> @shuffle_v8i64_10235467
define <8 x i64> @shuffle_v8i64_10225466(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10225466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $17, %zmm0, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,2,2]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,2,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 2, i32 2, i32 5, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -926,8 +1182,10 @@ define <8 x i64> @shuffle_v8i64_10225466
define <8 x i64> @shuffle_v8i64_00015444(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00015444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,0,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,0,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 5, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -936,8 +1194,10 @@ define <8 x i64> @shuffle_v8i64_00015444
define <8 x i64> @shuffle_v8i64_00204644(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00204644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 0, i32 4, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -946,8 +1206,10 @@ define <8 x i64> @shuffle_v8i64_00204644
define <8 x i64> @shuffle_v8i64_03004474(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_03004474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,3,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 4, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -956,8 +1218,10 @@ define <8 x i64> @shuffle_v8i64_03004474
define <8 x i64> @shuffle_v8i64_10004444(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10004444:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[1,0,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -966,8 +1230,10 @@ define <8 x i64> @shuffle_v8i64_10004444
define <8 x i64> @shuffle_v8i64_22006446(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_22006446:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,2,0,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,0,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 2, i32 0, i32 0, i32 6, i32 4, i32 4, i32 6>
ret <8 x i64> %shuffle
@@ -976,8 +1242,10 @@ define <8 x i64> @shuffle_v8i64_22006446
define <8 x i64> @shuffle_v8i64_33307474(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_33307474:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,3,3,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,3,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 0, i32 7, i32 4, i32 7, i32 4>
ret <8 x i64> %shuffle
@@ -986,8 +1254,9 @@ define <8 x i64> @shuffle_v8i64_33307474
define <8 x i64> @shuffle_v8i64_32104567(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_32104567:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -996,8 +1265,10 @@ define <8 x i64> @shuffle_v8i64_32104567
define <8 x i64> @shuffle_v8i64_00236744(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00236744:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,2,3]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 6, i32 7, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1006,8 +1277,10 @@ define <8 x i64> @shuffle_v8i64_00236744
define <8 x i64> @shuffle_v8i64_00226644(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00226644:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 6, i32 6, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1016,7 +1289,9 @@ define <8 x i64> @shuffle_v8i64_00226644
define <8 x i64> @shuffle_v8i64_10324567(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_10324567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $165, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1025,7 +1300,9 @@ define <8 x i64> @shuffle_v8i64_10324567
define <8 x i64> @shuffle_v8i64_11334567(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_11334567:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $175, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1034,7 +1311,9 @@ define <8 x i64> @shuffle_v8i64_11334567
define <8 x i64> @shuffle_v8i64_01235467(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_01235467:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $154, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1043,7 +1322,9 @@ define <8 x i64> @shuffle_v8i64_01235467
define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_01235466:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $26, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,0,2,2]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1052,8 +1333,10 @@ define <8 x i64> @shuffle_v8i64_01235466
define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_002u6u44:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,0,0]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 2, i32 undef, i32 6, i32 undef, i32 4, i32 4>
ret <8 x i64> %shuffle
@@ -1062,8 +1345,10 @@ define <8 x i64> @shuffle_v8i64_002u6u44
define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_00uu66uu:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm1
-; ALL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; ALL-NEXT: vpbroadcastq %xmm0, %ymm1
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 6, i32 6, i32 undef, i32 undef>
ret <8 x i64> %shuffle
@@ -1072,7 +1357,9 @@ define <8 x i64> @shuffle_v8i64_00uu66uu
define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_103245uu:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $37, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 undef, i32 undef>
ret <8 x i64> %shuffle
@@ -1081,7 +1368,9 @@ define <8 x i64> @shuffle_v8i64_103245uu
define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_1133uu67:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $143, %zmm0, %zmm0
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[2,3,2,3,6,7,6,7]
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x i64> %shuffle
@@ -1090,7 +1379,9 @@ define <8 x i64> @shuffle_v8i64_1133uu67
define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_0uu354uu:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $24, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 undef, i32 undef, i32 3, i32 5, i32 4, i32 undef, i32 undef>
ret <8 x i64> %shuffle
@@ -1099,7 +1390,9 @@ define <8 x i64> @shuffle_v8i64_0uu354uu
define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_uuu3uu66:
; ALL: # BB#0:
-; ALL-NEXT: vpermilpd $8, %zmm0, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 6, i32 6>
ret <8 x i64> %shuffle
@@ -1108,9 +1401,15 @@ define <8 x i64> @shuffle_v8i64_uuu3uu66
define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_6caa87e5:
; ALL: # BB#0:
-; ALL-NEXT: vmovdqa64 {{.*}}(%rip), %zmm2
-; ALL-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; ALL-NEXT: vmovaps %zmm2, %zmm0
+; ALL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ALL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
+; ALL-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; ALL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5]
+; ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; ALL-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 6, i32 12, i32 10, i32 10, i32 8, i32 7, i32 14, i32 5>
ret <8 x i64> %shuffle
More information about the llvm-commits
mailing list