[llvm] r250400 - AVX512: Implemented DAG lowering for shuff62x2/shufi62x2 instructions ( shuffle packed values at 128-bit granularity )
Igor Breger via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 15 06:29:08 PDT 2015
Author: ibreger
Date: Thu Oct 15 08:29:07 2015
New Revision: 250400
URL: http://llvm.org/viewvc/llvm-project?rev=250400&view=rev
Log:
AVX512: Implemented DAG lowering for shuff62x2/shufi62x2 instructions ( shuffle packed values at 128-bit granularity )
Differential Revision: http://reviews.llvm.org/D13648
Modified:
llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp
llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
Modified: llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp?rev=250400&r1=250399&r2=250400&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp (original)
+++ llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp Thu Oct 15 08:29:07 2015
@@ -107,6 +107,51 @@ static void getZeroExtensionTypes(const
}
}
+#define CASE_VSHUF_COMMON(Inst, Suffix, src2) \
+ case X86::VSHUFF##Inst##Suffix##r##src2##i: \
+ case X86::VSHUFF##Inst##Suffix##r##src2##ik: \
+ case X86::VSHUFF##Inst##Suffix##r##src2##ikz: \
+ case X86::VSHUFI##Inst##Suffix##r##src2##i: \
+ case X86::VSHUFI##Inst##Suffix##r##src2##ik: \
+ case X86::VSHUFI##Inst##Suffix##r##src2##ikz:
+
+#define CASE_VSHUF(Inst) \
+ CASE_VSHUF_COMMON(Inst, Z, r) \
+ CASE_VSHUF_COMMON(Inst, Z, m) \
+ CASE_VSHUF_COMMON(Inst, Z256, r) \
+ CASE_VSHUF_COMMON(Inst, Z256, m) \
+
+/// \brief Extracts the types and if it has memory operand for a given
+/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction.
+static void getVSHUF64x2FamilyInfo(const MCInst *MI, MVT &VT, bool &HasMemOp) {
+ HasMemOp = false;
+ switch (MI->getOpcode()) {
+ default:
+ llvm_unreachable("Unknown VSHUF64x2 family instructions.");
+ break;
+ CASE_VSHUF_COMMON(64X2, Z, m)
+ HasMemOp = true; // FALL THROUGH.
+ CASE_VSHUF_COMMON(64X2, Z, r)
+ VT = MVT::v8i64;
+ break;
+ CASE_VSHUF_COMMON(64X2, Z256, m)
+ HasMemOp = true; // FALL THROUGH.
+ CASE_VSHUF_COMMON(64X2, Z256, r)
+ VT = MVT::v4i64;
+ break;
+ CASE_VSHUF_COMMON(32X4, Z, m)
+ HasMemOp = true; // FALL THROUGH.
+ CASE_VSHUF_COMMON(32X4, Z, r)
+ VT = MVT::v16i32;
+ break;
+ CASE_VSHUF_COMMON(32X4, Z256, m)
+ HasMemOp = true; // FALL THROUGH.
+ CASE_VSHUF_COMMON(32X4, Z256, r)
+ VT = MVT::v8i32;
+ break;
+ }
+}
+
//===----------------------------------------------------------------------===//
// Top Level Entrypoint
//===----------------------------------------------------------------------===//
@@ -726,7 +771,25 @@ bool llvm::EmitAnyX86InstComments(const
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
break;
-
+ CASE_VSHUF(64X2)
+ CASE_VSHUF(32X4) {
+ MVT VT;
+ bool HasMemOp;
+ unsigned NumOp = MI->getNumOperands();
+ getVSHUF64x2FamilyInfo(MI, VT, HasMemOp);
+ decodeVSHUF64x2FamilyMask(VT, MI->getOperand(NumOp - 1).getImm(),
+ ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ if (HasMemOp) {
+ assert((NumOp >= 8) && "Expected at least 8 operands!");
+ Src1Name = getRegName(MI->getOperand(NumOp - 7).getReg());
+ } else {
+ assert((NumOp >= 4) && "Expected at least 4 operands!");
+ Src2Name = getRegName(MI->getOperand(NumOp - 2).getReg());
+ Src1Name = getRegName(MI->getOperand(NumOp - 3).getReg());
+ }
+ break;
+ }
case X86::UNPCKLPDrr:
case X86::VUNPCKLPDrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
Modified: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp?rev=250400&r1=250399&r2=250400&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp (original)
+++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.cpp Thu Oct 15 08:29:07 2015
@@ -264,6 +264,26 @@ void DecodeUNPCKLMask(MVT VT, SmallVecto
}
}
+/// \brief Decode a shuffle packed values at 128-bit granularity
+/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
+/// immediate mask into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumElementsInLane = 128 / VT.getScalarSizeInBits();
+ unsigned ControlBitsMask = NumLanes - 1;
+ unsigned NumControlBits = NumLanes / 2;
+
+ for (unsigned l = 0; l != NumLanes; ++l) {
+ unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
+ // We actually need the other source.
+ if (l >= NumLanes / 2)
+ LaneMask += NumLanes;
+ for (unsigned i = 0; i != NumElementsInLane; ++i)
+ ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
+ }
+}
+
void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
unsigned HalfSize = VT.getVectorNumElements() / 2;
Modified: llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h?rev=250400&r1=250399&r2=250400&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h (original)
+++ llvm/trunk/lib/Target/X86/Utils/X86ShuffleDecode.h Thu Oct 15 08:29:07 2015
@@ -86,6 +86,11 @@ void DecodeBLENDMask(MVT VT, unsigned Im
void DecodeVPERM2X128Mask(MVT VT, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
+/// \brief Decode a shuffle packed values at 128-bit granularity
+/// immediate mask into a shuffle mask.
+void decodeVSHUF64x2FamilyMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask);
+
/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
/// No VT provided since it only works on 256-bit, 4 element vectors.
void DecodeVPERMMask(unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=250400&r1=250399&r2=250400&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Oct 15 08:29:07 2015
@@ -10747,6 +10747,42 @@ static SDValue lower256BitVectorShuffle(
}
}
+/// \brief Try to lower a vector shuffle as a 128-bit shuffles.
+static SDValue lowerV4X128VectorShuffle(SDLoc DL, MVT VT,
+ ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ SelectionDAG &DAG) {
+ assert(VT.getScalarSizeInBits() == 64 &&
+ "Unexpected element type size for 128bit shuffle.");
+
+ // To handle 256 bit vector requires VLX and most probably
+ // function lowerV2X128VectorShuffle() is better solution.
+ assert(VT.getSizeInBits() == 512 &&
+ "Unexpected vector size for 128bit shuffle.");
+
+ SmallVector<int, 4> WidenedMask;
+ if (!canWidenShuffleElements(Mask, WidenedMask))
+ return SDValue();
+
+ // Form a 128-bit permutation.
+ // Convert the 64-bit shuffle mask selection values into 128-bit selection
+ // bits defined by a vshuf64x2 instruction's immediate control byte.
+ unsigned PermMask = 0, Imm = 0;
+ unsigned ControlBitsNum = WidenedMask.size() / 2;
+
+ for (int i = 0, Size = WidenedMask.size(); i < Size; ++i) {
+ if (WidenedMask[i] == SM_SentinelZero)
+ return SDValue();
+
+ // Use first element in place of undef mask.
+ Imm = (WidenedMask[i] == SM_SentinelUndef) ? 0 : WidenedMask[i];
+ PermMask |= (Imm % WidenedMask.size()) << (i * ControlBitsNum);
+ }
+
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ DAG.getConstant(PermMask, DL, MVT::i8));
+}
+
static SDValue lowerVectorShuffleWithPERMV(SDLoc DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
@@ -10774,6 +10810,10 @@ static SDValue lowerV8F64VectorShuffle(S
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
+ return Shuf128;
+
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
return Unpck;
@@ -10810,6 +10850,10 @@ static SDValue lowerV8I64VectorShuffle(S
ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Shuf128;
+
if (SDValue Unpck =
lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
return Unpck;
Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll?rev=250400&r1=250399&r2=250400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll Thu Oct 15 08:29:07 2015
@@ -4162,7 +4162,9 @@ define <16 x float>@test_int_x86_avx512_
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
@@ -4179,8 +4181,11 @@ define <8 x double>@test_int_x86_avx512_
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: vaddpd %zmm3, %zmm0, %zmm0
; CHECK-NEXT: retq
@@ -4200,7 +4205,9 @@ define <16 x i32>@test_int_x86_avx512_ma
; CHECK: ## BB#0:
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
; CHECK-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
@@ -4217,7 +4224,9 @@ define <8 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: ## zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
; CHECK-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
%res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
Modified: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll?rev=250400&r1=250399&r2=250400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll Thu Oct 15 08:29:07 2015
@@ -1867,7 +1867,7 @@ define <4 x i32> @test_mask_xor_epi32_rm
define <4 x i32> @test_mask_xor_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
;CHECK-LABEL: test_mask_xor_epi32_rmbkz_128
- ;CHECK: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07]
+ ;CHECK: vpxord (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0xef,0x07]
%q = load i32, i32* %ptr_b
%vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
%b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -2299,7 +2299,7 @@ define <8 x float> @test_mm512_maskz_add
define <8 x float> @test_mm512_mask_add_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_add_ps_256
- ;CHECK: vaddps %ymm1, %ymm0, %ymm2 {%k1}
+ ;CHECK: vaddps %ymm1, %ymm0, %ymm2 {%k1}
%res = call <8 x float> @llvm.x86.avx512.mask.add.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
@@ -2321,7 +2321,7 @@ define <4 x float> @test_mm512_maskz_add
define <4 x float> @test_mm512_mask_add_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_add_ps_128
- ;CHECK: vaddps %xmm1, %xmm0, %xmm2 {%k1}
+ ;CHECK: vaddps %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.add.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
@@ -2343,7 +2343,7 @@ define <8 x float> @test_mm512_maskz_sub
define <8 x float> @test_mm512_mask_sub_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_sub_ps_256
- ;CHECK: vsubps %ymm1, %ymm0, %ymm2 {%k1}
+ ;CHECK: vsubps %ymm1, %ymm0, %ymm2 {%k1}
%res = call <8 x float> @llvm.x86.avx512.mask.sub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
@@ -2365,7 +2365,7 @@ define <4 x float> @test_mm512_maskz_sub
define <4 x float> @test_mm512_mask_sub_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_sub_ps_128
- ;CHECK: vsubps %xmm1, %xmm0, %xmm2 {%k1}
+ ;CHECK: vsubps %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.sub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
@@ -2387,7 +2387,7 @@ define <8 x float> @test_mm512_maskz_mul
define <8 x float> @test_mm512_mask_mul_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_mul_ps_256
- ;CHECK: vmulps %ymm1, %ymm0, %ymm2 {%k1}
+ ;CHECK: vmulps %ymm1, %ymm0, %ymm2 {%k1}
%res = call <8 x float> @llvm.x86.avx512.mask.mul.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
@@ -2409,7 +2409,7 @@ define <4 x float> @test_mm512_maskz_mul
define <4 x float> @test_mm512_mask_mul_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_mul_ps_128
- ;CHECK: vmulps %xmm1, %xmm0, %xmm2 {%k1}
+ ;CHECK: vmulps %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.mul.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
@@ -2431,7 +2431,7 @@ define <8 x float> @test_mm512_maskz_div
define <8 x float> @test_mm512_mask_div_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_div_ps_256
- ;CHECK: vdivps %ymm1, %ymm0, %ymm2 {%k1}
+ ;CHECK: vdivps %ymm1, %ymm0, %ymm2 {%k1}
%res = call <8 x float> @llvm.x86.avx512.mask.div.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
@@ -2453,7 +2453,7 @@ define <4 x float> @test_mm512_maskz_div
define <4 x float> @test_mm512_mask_div_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_div_ps_128
- ;CHECK: vdivps %xmm1, %xmm0, %xmm2 {%k1}
+ ;CHECK: vdivps %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
@@ -2475,7 +2475,7 @@ define <8 x float> @test_mm512_maskz_max
define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_max_ps_256
- ;CHECK: vmaxps %ymm1, %ymm0, %ymm2 {%k1}
+ ;CHECK: vmaxps %ymm1, %ymm0, %ymm2 {%k1}
%res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
@@ -2497,7 +2497,7 @@ define <4 x float> @test_mm512_maskz_max
define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_max_ps_128
- ;CHECK: vmaxps %xmm1, %xmm0, %xmm2 {%k1}
+ ;CHECK: vmaxps %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
@@ -2519,7 +2519,7 @@ define <8 x float> @test_mm512_maskz_min
define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_min_ps_256
- ;CHECK: vminps %ymm1, %ymm0, %ymm2 {%k1}
+ ;CHECK: vminps %ymm1, %ymm0, %ymm2 {%k1}
%res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
ret <8 x float> %res
}
@@ -2541,7 +2541,7 @@ define <4 x float> @test_mm512_maskz_min
define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
;CHECK-LABEL: test_mm512_mask_min_ps_128
- ;CHECK: vminps %xmm1, %xmm0, %xmm2 {%k1}
+ ;CHECK: vminps %xmm1, %xmm0, %xmm2 {%k1}
%res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
ret <4 x float> %res
}
@@ -2591,9 +2591,9 @@ declare <8 x float> @llvm.x86.avx512.mas
declare <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_128
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpmaxsd %xmm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pmaxs_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
%res = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2 ,i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
@@ -2604,9 +2604,9 @@ define <4 x i32>@test_int_x86_avx512_mas
declare <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_256
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpmaxsd %ymm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
%res = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -2617,9 +2617,9 @@ define <8 x i32>@test_int_x86_avx512_mas
declare <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_128
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpmaxsq %xmm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
%res = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -2630,9 +2630,9 @@ define <2 x i64>@test_int_x86_avx512_mas
declare <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_256
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpmaxsq %ymm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pmaxs_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
%res = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
@@ -2643,9 +2643,9 @@ define <4 x i64>@test_int_x86_avx512_mas
declare <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_128
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpmaxud %xmm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pmaxu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2,i8 %mask) {
%res = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
@@ -2656,9 +2656,9 @@ define <4 x i32>@test_int_x86_avx512_mas
declare <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_256
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpmaxud %ymm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
%res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -2669,9 +2669,9 @@ define <8 x i32>@test_int_x86_avx512_mas
declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_128
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpmaxuq %xmm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
%res = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -2682,9 +2682,9 @@ define <2 x i64>@test_int_x86_avx512_mas
declare <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_256
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpmaxuq %ymm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pmaxu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
%res = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
@@ -2695,9 +2695,9 @@ define <4 x i64>@test_int_x86_avx512_mas
declare <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_128
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpminsd %xmm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pmins_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
%res = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
@@ -2708,9 +2708,9 @@ define <4 x i32>@test_int_x86_avx512_mas
declare <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_256
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpminsd %ymm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pmins_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
%res = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -2721,9 +2721,9 @@ define <8 x i32>@test_int_x86_avx512_mas
declare <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_128
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpminsq %xmm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pmins_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
%res = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -2734,9 +2734,9 @@ define <2 x i64>@test_int_x86_avx512_mas
declare <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_256
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpminsq %ymm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pmins_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
%res = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
@@ -2747,9 +2747,9 @@ define <4 x i64>@test_int_x86_avx512_mas
declare <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_128
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpminud %xmm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <4 x i32>@test_int_x86_avx512_mask_pminu_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) {
%res = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask)
@@ -2760,9 +2760,9 @@ define <4 x i32>@test_int_x86_avx512_mas
declare <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_256
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpminud %ymm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <8 x i32>@test_int_x86_avx512_mask_pminu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
%res = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -2773,9 +2773,9 @@ define <8 x i32>@test_int_x86_avx512_mas
declare <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_128
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpminuq %xmm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <2 x i64>@test_int_x86_avx512_mask_pminu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
%res = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -2786,9 +2786,9 @@ define <2 x i64>@test_int_x86_avx512_mas
declare <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_256
-; CHECK-NOT: call
+; CHECK-NOT: call
; CHECK: vpminuq %ymm
-; CHECK: {%k1}
+; CHECK: {%k1}
define <4 x i64>@test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) {
%res = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask)
@@ -2799,8 +2799,8 @@ define <4 x i64>@test_int_x86_avx512_mas
declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_128
-; CHECK-NOT: call
-; CHECK: kmov
+; CHECK-NOT: call
+; CHECK: kmov
; CHECK: vpermt2d %xmm{{.*}}{%k1}
; CHECK-NOT: {z}
define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
@@ -2813,8 +2813,8 @@ define <4 x i32>@test_int_x86_avx512_mas
declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_128
-; CHECK-NOT: call
-; CHECK: kmov
+; CHECK-NOT: call
+; CHECK: kmov
; CHECK: vpermt2d %xmm{{.*}}{%k1} {z}
define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
%res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
@@ -2826,8 +2826,8 @@ define <4 x i32>@test_int_x86_avx512_mas
declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_256
-; CHECK-NOT: call
-; CHECK: kmov
+; CHECK-NOT: call
+; CHECK: kmov
; CHECK: vpermt2d %ymm{{.*}}{%k1}
; CHECK-NOT: {z}
define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
@@ -2840,8 +2840,8 @@ define <8 x i32>@test_int_x86_avx512_mas
declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_256
-; CHECK-NOT: call
-; CHECK: kmov
+; CHECK-NOT: call
+; CHECK: kmov
; CHECK: vpermt2d {{.*}}{%k1} {z}
define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
%res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
@@ -2853,9 +2853,9 @@ define <8 x i32>@test_int_x86_avx512_mas
declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2pd %xmm{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2pd %xmm{{.*}}{%k1}
define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
%res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
@@ -2866,9 +2866,9 @@ define <2 x double>@test_int_x86_avx512_
declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2pd %ymm{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2pd %ymm{{.*}}{%k1}
define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
%res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
@@ -2879,9 +2879,9 @@ define <4 x double>@test_int_x86_avx512_
declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2ps %xmm{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2ps %xmm{{.*}}{%k1}
define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
%res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
@@ -2892,9 +2892,9 @@ define <4 x float>@test_int_x86_avx512_m
declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpermi2ps %ymm{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpermi2ps %ymm{{.*}}{%k1}
define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
%res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
@@ -2905,9 +2905,9 @@ define <8 x float>@test_int_x86_avx512_m
declare <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64>, <2 x i64>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsq{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsq{{.*}}{%k1}
define <2 x i64>@test_int_x86_avx512_mask_pabs_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
%res = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
@@ -2918,9 +2918,9 @@ define <2 x i64>@test_int_x86_avx512_mas
declare <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64>, <4 x i64>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsq{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsq{{.*}}{%k1}
define <4 x i64>@test_int_x86_avx512_mask_pabs_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
%res = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
@@ -2931,9 +2931,9 @@ define <4 x i64>@test_int_x86_avx512_mas
declare <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32>, <4 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsd{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsd{{.*}}{%k1}
define <4 x i32>@test_int_x86_avx512_mask_pabs_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
%res = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
%res1 = call <4 x i32> @llvm.x86.avx512.mask.pabs.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
@@ -2944,9 +2944,9 @@ define <4 x i32>@test_int_x86_avx512_mas
declare <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32>, <8 x i32>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vpabsd{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vpabsd{{.*}}{%k1}
define <8 x i32>@test_int_x86_avx512_mask_pabs_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
%res = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
%res1 = call <8 x i32> @llvm.x86.avx512.mask.pabs.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
@@ -2958,9 +2958,9 @@ define <8 x i32>@test_int_x86_avx512_mas
declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefpd{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vscalefpd{{.*}}{%k1}
define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
%res = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
@@ -2971,9 +2971,9 @@ define <2 x double>@test_int_x86_avx512_
declare <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefpd{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vscalefpd{{.*}}{%k1}
define <4 x double>@test_int_x86_avx512_mask_scalef_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
%res = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
%res1 = call <4 x double> @llvm.x86.avx512.mask.scalef.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
@@ -2983,9 +2983,9 @@ define <4 x double>@test_int_x86_avx512_
declare <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefps{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vscalefps{{.*}}{%k1}
define <4 x float>@test_int_x86_avx512_mask_scalef_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
%res = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
%res1 = call <4 x float> @llvm.x86.avx512.mask.scalef.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
@@ -2995,9 +2995,9 @@ define <4 x float>@test_int_x86_avx512_m
declare <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vscalefps{{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vscalefps{{.*}}{%k1}
define <8 x float>@test_int_x86_avx512_mask_scalef_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
%res = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.scalef.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
@@ -3009,7 +3009,7 @@ declare <2 x double> @llvm.x86.avx512.ma
define <2 x double>@test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_128:
-; CHECK: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1}
; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x15,0xc1]
%res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
%res1 = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
@@ -3046,7 +3046,7 @@ declare <8 x float> @llvm.x86.avx512.mas
define <8 x float>@test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_256:
; CHECK: ## BB#0:
-; CHECK: vunpckhps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK: vunpckhps %ymm1, %ymm0, %ymm2 {%k1}
; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x15,0xc1]
%res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
%res1 = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
@@ -4457,9 +4457,9 @@ define <8 x float>@test_int_x86_avx512_m
declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_pd_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscalepd {{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vrndscalepd {{.*}}{%k1}
; CHECK: vrndscalepd
define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
%res = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3)
@@ -4470,9 +4470,9 @@ define <2 x double>@test_int_x86_avx512_
declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_pd_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscalepd {{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vrndscalepd {{.*}}{%k1}
; CHECK: vrndscalepd
define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
%res = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3)
@@ -4483,9 +4483,9 @@ define <4 x double>@test_int_x86_avx512_
declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_ps_128
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscaleps {{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vrndscaleps {{.*}}{%k1}
; CHECK: vrndscaleps
define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
%res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 %x3)
@@ -4497,9 +4497,9 @@ define <4 x float>@test_int_x86_avx512_m
declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8)
; CHECK-LABEL: @test_int_x86_avx512_mask_rndscale_ps_256
-; CHECK-NOT: call
-; CHECK: kmov
-; CHECK: vrndscaleps {{.*}}{%k1}
+; CHECK-NOT: call
+; CHECK: kmov
+; CHECK: vrndscaleps {{.*}}{%k1}
; CHECK: vrndscaleps
define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
%res = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 5, <8 x float> %x2, i8 %x3)
@@ -4516,13 +4516,20 @@ define <8 x float>@test_int_x86_avx512_m
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: ## ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vshuff32x4 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vaddps %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
%res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4)
%res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1)
- %res2 = fadd <8 x float> %res, %res1
- ret <8 x float> %res2
+ %res2 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4)
+ %res3 = fadd <8 x float> %res, %res1
+ %res4 = fadd <8 x float> %res2, %res3
+ ret <8 x float> %res4
}
declare <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double>, <4 x double>, i32, <4 x double>, i8)
@@ -4533,13 +4540,20 @@ define <4 x double>@test_int_x86_avx512_
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: ## ymm2 = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT: ## ymm3 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vshuff64x2 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vaddpd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
; CHECK-NEXT: retq
%res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4)
%res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1)
- %res2 = fadd <4 x double> %res, %res1
- ret <4 x double> %res2
+ %res2 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4)
+ %res3 = fadd <4 x double> %res, %res1
+ %res4 = fadd <4 x double> %res2, %res3
+ ret <4 x double> %res4
}
declare <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8)
@@ -4550,7 +4564,9 @@ define <8 x i32>@test_int_x86_avx512_mas
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vshufi32x4 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; CHECK-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
@@ -4567,7 +4583,9 @@ define <4 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: movzbl %dil, %eax
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: ## ymm2 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vshufi64x2 $22, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: ## ymm0 = ymm0[0,1],ymm1[2,3]
; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; CHECK-NEXT: retq
%res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll?rev=250400&r1=250399&r2=250400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll Thu Oct 15 08:29:07 2015
@@ -139,14 +139,12 @@ define <8 x double> @shuffle_v8f64_70000
define <8 x double> @shuffle_v8f64_01014545(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_01014545:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5]
-; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_01014545:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,4,0,5,0,4,0,5,0]
-; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x double> %shuffle
@@ -1122,18 +1120,16 @@ define <8 x i64> @shuffle_v8i64_70000000
}
define <8 x i64> @shuffle_v8i64_01014545(<8 x i64> %a, <8 x i64> %b) {
-;
; AVX512F-LABEL: shuffle_v8i64_01014545:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5]
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_01014545:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,0,0,0,1,0,4,0,5,0,4,0,5,0]
-; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5]
; AVX512F-32-NEXT: retl
+
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5>
ret <8 x i64> %shuffle
}
@@ -2010,3 +2006,135 @@ define <8 x i64> @shuffle_v8i64_193b5d7f
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32><i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
ret <8 x i64> %shuffle
}
+
+define <8 x double> @test_vshuff64x2_512(<8 x double> %x, <8 x double> %x1) nounwind {
+; AVX512F-LABEL: test_vshuff64x2_512:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff64x2_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: retl
+ %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_maskz(<8 x double> %x, <8 x double> %x1, <8 x i1> %mask) nounwind {
+; AVX512F-LABEL: test_vshuff64x2_512_maskz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff64x2_512_maskz:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
+; AVX512F-32-NEXT: vpandq .LCPI118_0, %zmm2, %zmm2
+; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: retl
+ %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+ %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <8 x i64> @test_vshufi64x2_512_mask(<8 x i64> %x, <8 x i64> %x1, <8 x i1> %mask) nounwind {
+; AVX512F-LABEL: test_vshufi64x2_512_mask:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm2, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshufi64x2_512_mask:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpmovsxwq %xmm2, %zmm2
+; AVX512F-32-NEXT: vpandq .LCPI119_0, %zmm2, %zmm2
+; AVX512F-32-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: retl
+ %y = shufflevector <8 x i64> %x, <8 x i64> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+ %res = select <8 x i1> %mask, <8 x i64> %y, <8 x i64> %x
+ ret <8 x i64> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem(<8 x double> %x, <8 x double> *%ptr) nounwind {
+; AVX512F-LABEL: test_vshuff64x2_512_mem:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff64x2_512_mem:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-32-NEXT: retl
+ %x1 = load <8 x double>,<8 x double> *%ptr,align 1
+ %res = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem_mask(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind {
+; AVX512F-LABEL: test_vshuff64x2_512_mem_mask:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff64x2_512_mem_mask:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
+; AVX512F-32-NEXT: vpandq .LCPI121_0, %zmm1, %zmm1
+; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-32-NEXT: retl
+ %x1 = load <8 x double>,<8 x double> *%ptr,align 1
+ %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+ %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> %x
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_vshuff64x2_512_mem_maskz(<8 x double> %x, <8 x double> *%ptr, <8 x i1> %mask) nounwind {
+; AVX512F-LABEL: test_vshuff64x2_512_mem_maskz:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff64x2_512_mem_maskz:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vpmovsxwq %xmm1, %zmm1
+; AVX512F-32-NEXT: vpandq .LCPI122_0, %zmm1, %zmm1
+; AVX512F-32-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5],mem[2,3,0,1]
+; AVX512F-32-NEXT: retl
+ %x1 = load <8 x double>,<8 x double> *%ptr,align 1
+ %y = shufflevector <8 x double> %x, <8 x double> %x1, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9>
+ %res = select <8 x i1> %mask, <8 x double> %y, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
+
+define <16 x float> @test_vshuff32x4_512(<16 x float> %x, <16 x float> %x1) nounwind {
+; AVX512F-LABEL: test_vshuff32x4_512:
+; AVX512F: # BB#0:
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_vshuff32x4_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[2,3,0,1]
+; AVX512F-32-NEXT: retl
+ %res = shufflevector <16 x float> %x, <16 x float> %x1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+ ret <16 x float> %res
+}
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll?rev=250400&r1=250399&r2=250400&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll Thu Oct 15 08:29:07 2015
@@ -213,8 +213,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a
; AVX512F-NEXT: movzbl %dil, %eax
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,4,5,u,u,u,u>
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
@@ -224,8 +223,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovb %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
-; VL_BW_DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,4,5,u,u,u,u>
-; VL_BW_DQ-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; VL_BW_DQ-NEXT: vshufi64x2 $8, %zmm0, %zmm0, %zmm0 # zmm0 = zmm0[0,1,4,5,0,1,0,1]
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: kmovb %k0, %eax
; VL_BW_DQ-NEXT: retq
More information about the llvm-commits
mailing list