[llvm] r290947 - [X86] Attempt to pre-truncate arithmetic operations if useful
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 4 00:05:43 PST 2017
Author: rksimon
Date: Wed Jan 4 02:05:42 2017
New Revision: 290947
URL: http://llvm.org/viewvc/llvm-project?rev=290947&view=rev
Log:
[X86] Attempt to pre-truncate arithmetic operations if useful
In some cases it's more efficient to combine TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) if the binop is legal for the truncated types.
This is true for vector integer multiplication (especially vXi64), as well as for ADD/AND/XOR/OR in cases where we only need to truncate one of the inputs at runtime (e.g. a duplicated input or a one-use constant we can fold).
Further work could be done here: scalar cases (especially i64) would often benefit (provided we avoid partial-register issues etc.), as would other opcodes, along with better analysis of when truncating the inputs actually reduces costs.
I considered implementing this for all targets within the DAGCombiner, but I wasn't sure we could devise a suitable cost-model system that would give us the range we need.
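As a rough illustration (the function name below is hypothetical, but the pattern matches the trunc_mul_v4i64_v4i32 test updated in this patch), the combine targets IR like:

define <4 x i32> @trunc_mul_example(<4 x i64> %a, <4 x i64> %b) {  ; hypothetical example function
  %1 = mul <4 x i64> %a, %b                ; v4i64 multiply - not directly supported before AVX512DQ
  %2 = trunc <4 x i64> %1 to <4 x i32>     ; only the low 32 bits of each element are used
  ret <4 x i32> %2
}

Without the combine the vXi64 multiply is expanded into a PMULUDQ/shift/add sequence; with it, both operands are truncated to <4 x i32> first and a single (V)PMULLD is emitted on SSE41+/AVX targets, as the vector-trunc-math.ll diffs below show.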
Differential Revision: https://reviews.llvm.org/D28219
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx512-any_extend_load.ll
llvm/trunk/test/CodeGen/X86/i64-to-float.ll
llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=290947&r1=290946&r2=290947&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jan 4 02:05:42 2017
@@ -31833,6 +31833,83 @@ static SDValue combineFaddFsub(SDNode *N
return SDValue();
}
+/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
+/// the codegen.
+/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
+static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDLoc &DL) {
+ assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
+ SDValue Src = N->getOperand(0);
+ unsigned Opcode = Src.getOpcode();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ EVT VT = N->getValueType(0);
+ EVT SrcVT = Src.getValueType();
+
+ auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
+ // TODO: Add extra cases where we can truncate both inputs for the
+ // cost of one (or none).
+ // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+ if (Op0 == Op1)
+ return true;
+
+ SDValue BC0 = peekThroughOneUseBitcasts(Op0);
+ SDValue BC1 = peekThroughOneUseBitcasts(Op1);
+ return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
+ ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+ };
+
+ auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
+ SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
+ SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+ return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+ };
+
+ // Don't combine if the operation has other uses.
+ if (!N->isOnlyUserOf(Src.getNode()))
+ return SDValue();
+
+ // Only support vector truncation for now.
+ // TODO: i64 scalar math would benefit as well.
+ if (!VT.isVector())
+ return SDValue();
+
+ // In most cases it's only worth pre-truncating if we're only facing the cost
+ // of one truncation.
+ // i.e. if one of the inputs will constant fold or the input is repeated.
+ switch (Opcode) {
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR: {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+ IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ return TruncateArithmetic(Op0, Op1);
+ break;
+ }
+
+ case ISD::MUL:
+ // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
+ // better to truncate if we have the chance.
+ if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
+ !TLI.isOperationLegal(Opcode, SrcVT))
+ return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
+ LLVM_FALLTHROUGH;
+ case ISD::ADD: {
+ SDValue Op0 = Src.getOperand(0);
+ SDValue Op1 = Src.getOperand(1);
+ if (TLI.isOperationLegal(Opcode, VT) &&
+ IsRepeatedOpOrOneUseConstant(Op0, Op1))
+ return TruncateArithmetic(Op0, Op1);
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
@@ -32019,6 +32096,10 @@ static SDValue combineTruncate(SDNode *N
SDValue Src = N->getOperand(0);
SDLoc DL(N);
+ // Attempt to pre-truncate inputs to arithmetic ops instead.
+ if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
+ return V;
+
// Try to detect AVG pattern first.
if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
return Avg;
Modified: llvm/trunk/test/CodeGen/X86/avx512-any_extend_load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-any_extend_load.ll?rev=290947&r1=290946&r2=290947&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-any_extend_load.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-any_extend_load.ll Wed Jan 4 02:05:42 2017
@@ -22,10 +22,8 @@ define void @any_extend_load_v8i64(<8 x
define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
; KNL-LABEL: any_extend_load_v8i32:
; KNL: # BB#0:
-; KNL-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; KNL-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
+; KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; KNL-NEXT: vmovq %xmm0, (%rdi)
; KNL-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/i64-to-float.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/i64-to-float.ll?rev=290947&r1=290946&r2=290947&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/i64-to-float.ll (original)
+++ llvm/trunk/test/CodeGen/X86/i64-to-float.ll Wed Jan 4 02:05:42 2017
@@ -71,34 +71,32 @@ define <2 x double> @mask_uitofp_2i64_2f
define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
; X32-SSE-LABEL: mask_sitofp_4i64_4f32:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: mask_sitofp_4i64_4f32:
; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-AVX-NEXT: vzeroupper
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: mask_sitofp_4i64_4f32:
; X64-SSE: # BB#0:
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mask_sitofp_4i64_4f32:
; X64-AVX: # BB#0:
-; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
@@ -110,34 +108,32 @@ define <4 x float> @mask_sitofp_4i64_4f3
define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind {
; X32-SSE-LABEL: mask_uitofp_4i64_4f32:
; X32-SSE: # BB#0:
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT: andps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X32-SSE-NEXT: retl
;
; X32-AVX-LABEL: mask_uitofp_4i64_4f32:
; X32-AVX: # BB#0:
-; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0
; X32-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X32-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT: vandps {{\.LCPI.*}}, %xmm0, %xmm0
; X32-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; X32-AVX-NEXT: vzeroupper
; X32-AVX-NEXT: retl
;
; X64-SSE-LABEL: mask_uitofp_4i64_4f32:
; X64-SSE: # BB#0:
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT: andps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: mask_uitofp_4i64_4f32:
; X64-AVX: # BB#0:
-; X64-AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; X64-AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll?rev=290947&r1=290946&r2=290947&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll Wed Jan 4 02:05:42 2017
@@ -419,40 +419,31 @@ define <16 x i8> @trunc_add_v16i16_v16i8
define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v4i64_v4i32:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm2
-; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT: paddq %xmm2, %xmm0
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
; AVX512: # BB#0:
-; AVX512-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -462,52 +453,39 @@ define <4 x i32> @trunc_add_const_v4i64_
define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i64_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm4
-; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; SSE-NEXT: paddq %xmm0, %xmm4
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -515,14 +493,14 @@ define <8 x i16> @trunc_add_const_v8i64_
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -532,41 +510,38 @@ define <8 x i16> @trunc_add_const_v8i64_
define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i32_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -576,17 +551,6 @@ define <8 x i16> @trunc_add_const_v8i32_
define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm8
-; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
-; SSE-NEXT: paddq %xmm8, %xmm0
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm1
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm3
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm4
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm5
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm6
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
@@ -603,50 +567,37 @@ define <16 x i8> @trunc_add_const_v16i64
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovq %rax, %xmm4
-; AVX1-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3
-; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vpackuswb %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpackuswb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
-; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
-; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -666,37 +617,35 @@ define <16 x i8> @trunc_add_const_v16i64
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -706,10 +655,6 @@ define <16 x i8> @trunc_add_const_v16i64
define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm2
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -718,31 +663,27 @@ define <16 x i8> @trunc_add_const_v16i32
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -752,13 +693,14 @@ define <16 x i8> @trunc_add_const_v16i32
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vpaddd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -768,56 +710,54 @@ define <16 x i8> @trunc_add_const_v16i32
define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
-; SSE-NEXT: paddw {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -1676,69 +1616,39 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(
; AVX1-LABEL: trunc_mul_v4i64_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
-; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
-; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v4i64_v4i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512F-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX512F-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
@@ -1757,46 +1667,17 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(
define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v8i64_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: movdqa %xmm0, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm4, %xmm8
-; SSE-NEXT: movdqa %xmm4, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm0, %xmm9
-; SSE-NEXT: paddq %xmm8, %xmm9
-; SSE-NEXT: psllq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm4, %xmm0
-; SSE-NEXT: paddq %xmm9, %xmm0
-; SSE-NEXT: movdqa %xmm1, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm5, %xmm8
-; SSE-NEXT: movdqa %xmm5, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm1, %xmm4
-; SSE-NEXT: paddq %xmm8, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm5, %xmm1
-; SSE-NEXT: paddq %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm6, %xmm4
-; SSE-NEXT: movdqa %xmm6, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm2, %xmm5
-; SSE-NEXT: paddq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm6, %xmm2
-; SSE-NEXT: paddq %xmm5, %xmm2
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm7, %xmm4
-; SSE-NEXT: movdqa %xmm7, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm3, %xmm5
-; SSE-NEXT: paddq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm7, %xmm3
-; SSE-NEXT: paddq %xmm5, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
@@ -1808,111 +1689,68 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: pmullw %xmm6, %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_v8i64_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
-; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5
-; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5
-; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
-; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2
-; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
-; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5
-; AVX1-NEXT: vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
-; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm6
-; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6
-; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1,2,3],xmm1[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1,2,3],xmm3[4],xmm5[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2,3],xmm0[4],xmm5[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_v8i64_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4
-; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5
-; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5
-; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
-; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
-; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512F-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512F-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512F-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512F-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
@@ -2186,104 +2024,60 @@ define <16 x i8> @trunc_mul_v16i64_v16i8
;
; AVX2-LABEL: trunc_mul_v16i64_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm8
-; AVX2-NEXT: vpmuludq %ymm5, %ymm8, %ymm8
-; AVX2-NEXT: vpsrlq $32, %ymm5, %ymm9
-; AVX2-NEXT: vpmuludq %ymm9, %ymm1, %ymm9
-; AVX2-NEXT: vpaddq %ymm8, %ymm9, %ymm8
-; AVX2-NEXT: vpsllq $32, %ymm8, %ymm8
-; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm8, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm5
-; AVX2-NEXT: vpmuludq %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm8
-; AVX2-NEXT: vpmuludq %ymm8, %ymm0, %ymm8
-; AVX2-NEXT: vpaddq %ymm5, %ymm8, %ymm5
-; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5
-; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm5, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX2-NEXT: vpmuludq %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vpsrlq $32, %ymm7, %ymm5
-; AVX2-NEXT: vpmuludq %ymm5, %ymm3, %ymm5
-; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
-; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
-; AVX2-NEXT: vpmuludq %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vpsrlq $32, %ymm6, %ymm5
-; AVX2-NEXT: vpmuludq %ymm5, %ymm2, %ymm5
-; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
-; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vpmulld %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpmulld %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpmulld %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpmulld %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm4
-; AVX512F-NEXT: vpmuludq %zmm3, %zmm4, %zmm4
-; AVX512F-NEXT: vpsrlq $32, %zmm3, %zmm5
-; AVX512F-NEXT: vpmuludq %zmm5, %zmm1, %zmm5
-; AVX512F-NEXT: vpaddq %zmm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpsllq $32, %zmm4, %zmm4
-; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT: vpaddq %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm3
-; AVX512F-NEXT: vpmuludq %zmm2, %zmm3, %zmm3
-; AVX512F-NEXT: vpsrlq $32, %zmm2, %zmm4
-; AVX512F-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
-; AVX512F-NEXT: vpaddq %zmm3, %zmm4, %zmm3
-; AVX512F-NEXT: vpsllq $32, %zmm3, %zmm3
-; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpaddq %zmm3, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmovqd %zmm3, %ymm3
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
+; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm4, %zmm4
-; AVX512BW-NEXT: vpsrlq $32, %zmm3, %zmm5
-; AVX512BW-NEXT: vpmuludq %zmm5, %zmm1, %zmm5
-; AVX512BW-NEXT: vpaddq %zmm4, %zmm5, %zmm4
-; AVX512BW-NEXT: vpsllq $32, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddq %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm2, %zmm3, %zmm3
-; AVX512BW-NEXT: vpsrlq $32, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmuludq %zmm4, %zmm0, %zmm4
-; AVX512BW-NEXT: vpaddq %zmm3, %zmm4, %zmm3
-; AVX512BW-NEXT: vpsllq $32, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1
+; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2
+; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: retq
@@ -2479,108 +2273,33 @@ define <4 x i32> @trunc_mul_const_v4i64_
; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_const_v4i64_v4i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
-; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
-; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3]
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
+; AVX512: # BB#0:
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
-; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm4
-; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: psrlq $32, %xmm0
-; SSE-NEXT: pmuludq %xmm4, %xmm0
-; SSE-NEXT: psllq $32, %xmm0
-; SSE-NEXT: paddq %xmm5, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm5
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: psrlq $32, %xmm1
-; SSE-NEXT: pmuludq %xmm4, %xmm1
-; SSE-NEXT: psllq $32, %xmm1
-; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5]
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: psllq $32, %xmm2
-; SSE-NEXT: paddq %xmm5, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [6,7]
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: paddq %xmm5, %xmm3
+ %2 = trunc <4 x i64> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
+define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
+; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
+; SSE: # BB#0:
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
@@ -2592,64 +2311,28 @@ define <8 x i16> @trunc_mul_const_v8i64_
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm2
+; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: movl $1, %eax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4
-; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3]
-; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5]
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
-; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7]
-; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_const_v8i64_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,7]
-; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3]
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -2657,37 +2340,15 @@ define <8 x i16> @trunc_mul_const_v8i64_
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: trunc_mul_const_v8i64_v8i16:
-; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm0
-; AVX512F-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0
-; AVX512F-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_mul_const_v8i64_v8i16:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_mul_const_v8i64_v8i16:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512DQ-NEXT: retq
+; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
+; AVX512: # BB#0:
+; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
ret <8 x i16> %2
@@ -2696,55 +2357,38 @@ define <8 x i16> @trunc_mul_const_v8i64_
define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,1,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm3, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -2907,34 +2551,12 @@ define <16 x i8> @trunc_mul_const_v16i64
;
; AVX2-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,6,7]
-; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm5
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3]
-; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm5
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15]
-; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm5
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,10,11]
-; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpaddq %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
@@ -2943,8 +2565,10 @@ define <16 x i8> @trunc_mul_const_v16i64
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
@@ -2955,50 +2579,30 @@ define <16 x i8> @trunc_mul_const_v16i64
;
; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpmuludq %zmm2, %zmm1, %zmm3
-; AVX512F-NEXT: vpsrlq $32, %zmm1, %zmm1
-; AVX512F-NEXT: vpmuludq %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpsllq $32, %zmm1, %zmm1
-; AVX512F-NEXT: vpaddq %zmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm3
-; AVX512F-NEXT: vpsrlq $32, %zmm0, %zmm0
-; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpsllq $32, %zmm0, %zmm0
-; AVX512F-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsllq $32, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddq %zmm1, %zmm3, %zmm1
-; AVX512BW-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllq $32, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512DQ-NEXT: vpmullq {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
+; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: retq
@@ -3073,15 +2677,15 @@ define <16 x i8> @trunc_mul_const_v16i32
;
; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
@@ -3547,36 +3151,31 @@ define <16 x i8> @trunc_and_v16i16_v16i8
define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v4i64_v4i32:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm2
-; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: andps {{.*}}(%rip), %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v4i64_v4i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
; AVX512: # BB#0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -3586,30 +3185,23 @@ define <4 x i32> @trunc_and_const_v4i64_
define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v8i64_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm4
-; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE-NEXT: andpd {{.*}}(%rip), %xmm2
+; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
@@ -3620,13 +3212,12 @@ define <8 x i16> @trunc_and_const_v8i64_
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -3634,14 +3225,14 @@ define <8 x i16> @trunc_and_const_v8i64_
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -3651,40 +3242,38 @@ define <8 x i16> @trunc_and_const_v8i64_
define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v8i32_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -3694,41 +3283,27 @@ define <8 x i16> @trunc_and_const_v8i32_
define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v16i64_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm8
-; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
-; SSE-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE-NEXT: pand {{.*}}(%rip), %xmm4
-; SSE-NEXT: pand {{.*}}(%rip), %xmm5
-; SSE-NEXT: pand {{.*}}(%rip), %xmm6
-; SSE-NEXT: pand {{.*}}(%rip), %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm9, %xmm7
-; SSE-NEXT: pand %xmm9, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; SSE-NEXT: pand %xmm8, %xmm7
+; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm9, %xmm5
-; SSE-NEXT: pand %xmm9, %xmm4
+; SSE-NEXT: pand %xmm8, %xmm5
+; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm9, %xmm3
-; SSE-NEXT: pand %xmm9, %xmm2
+; SSE-NEXT: pand %xmm8, %xmm3
+; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm9, %xmm1
-; SSE-NEXT: pand %xmm9, %xmm8
+; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
@@ -3749,15 +3324,12 @@ define <16 x i8> @trunc_and_const_v16i64
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -3777,37 +3349,35 @@ define <16 x i8> @trunc_and_const_v16i64
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512DQ-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -3817,10 +3387,6 @@ define <16 x i8> @trunc_and_const_v16i64
define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v16i32_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE-NEXT: pand {{.*}}(%rip), %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -3829,12 +3395,11 @@ define <16 x i8> @trunc_and_const_v16i32
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
@@ -3845,13 +3410,12 @@ define <16 x i8> @trunc_and_const_v16i32
; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -3861,13 +3425,14 @@ define <16 x i8> @trunc_and_const_v16i32
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -3877,55 +3442,54 @@ define <16 x i8> @trunc_and_const_v16i32
define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v16i16_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: pand {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4323,36 +3887,31 @@ define <16 x i8> @trunc_xor_v16i16_v16i8
define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm2
-; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: xorps {{.*}}(%rip), %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX512: # BB#0:
-; AVX512-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4362,30 +3921,23 @@ define <4 x i32> @trunc_xor_const_v4i64_
define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm4
-; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; SSE-NEXT: pxor %xmm0, %xmm4
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE-NEXT: xorpd {{.*}}(%rip), %xmm2
+; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
@@ -4396,13 +3948,12 @@ define <8 x i16> @trunc_xor_const_v8i64_
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -4410,14 +3961,14 @@ define <8 x i16> @trunc_xor_const_v8i64_
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -4427,40 +3978,38 @@ define <8 x i16> @trunc_xor_const_v8i64_
define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4470,17 +4019,6 @@ define <8 x i16> @trunc_xor_const_v8i32_
define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm8
-; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
-; SSE-NEXT: pxor %xmm8, %xmm0
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm4
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm5
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm6
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
@@ -4497,14 +4035,11 @@ define <16 x i8> @trunc_xor_const_v16i64
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
@@ -4525,15 +4060,12 @@ define <16 x i8> @trunc_xor_const_v16i64
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -4553,37 +4085,35 @@ define <16 x i8> @trunc_xor_const_v16i64
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512DQ-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -4593,10 +4123,6 @@ define <16 x i8> @trunc_xor_const_v16i64
define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm2
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -4605,12 +4131,11 @@ define <16 x i8> @trunc_xor_const_v16i32
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
@@ -4621,13 +4146,12 @@ define <16 x i8> @trunc_xor_const_v16i32
; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -4637,13 +4161,14 @@ define <16 x i8> @trunc_xor_const_v16i32
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vpxord {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -4653,55 +4178,54 @@ define <16 x i8> @trunc_xor_const_v16i32
define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -5099,36 +4623,31 @@ define <16 x i8> @trunc_or_v16i16_v16i8(
define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v4i64_v4i32:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm2
-; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: orps {{.*}}(%rip), %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE-NEXT: orps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
; AVX512: # BB#0:
-; AVX512-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
%2 = trunc <4 x i64> %1 to <4 x i32>
@@ -5138,30 +4657,23 @@ define <4 x i32> @trunc_or_const_v4i64_v
define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i64_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm4
-; SSE-NEXT: pslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7]
-; SSE-NEXT: por %xmm0, %xmm4
-; SSE-NEXT: por {{.*}}(%rip), %xmm1
-; SSE-NEXT: por {{.*}}(%rip), %xmm2
-; SSE-NEXT: por {{.*}}(%rip), %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
+; SSE-NEXT: orpd {{.*}}(%rip), %xmm2
+; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
@@ -5172,13 +4684,12 @@ define <8 x i16> @trunc_or_const_v8i64_v
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
@@ -5186,14 +4697,14 @@ define <8 x i16> @trunc_or_const_v8i64_v
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
%2 = trunc <8 x i64> %1 to <8 x i16>
@@ -5203,40 +4714,38 @@ define <8 x i16> @trunc_or_const_v8i64_v
define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i32_v8i16:
; SSE: # BB#0:
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
-; SSE-NEXT: por {{.*}}(%rip), %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
; AVX1: # BB#0:
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
; AVX2: # BB#0:
-; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
; AVX512: # BB#0:
-; AVX512-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = trunc <8 x i32> %1 to <8 x i16>
@@ -5246,17 +4755,6 @@ define <8 x i16> @trunc_or_const_v8i32_v
define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i64_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: movd %rax, %xmm8
-; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7]
-; SSE-NEXT: por %xmm8, %xmm0
-; SSE-NEXT: por {{.*}}(%rip), %xmm1
-; SSE-NEXT: por {{.*}}(%rip), %xmm2
-; SSE-NEXT: por {{.*}}(%rip), %xmm3
-; SSE-NEXT: por {{.*}}(%rip), %xmm4
-; SSE-NEXT: por {{.*}}(%rip), %xmm5
-; SSE-NEXT: por {{.*}}(%rip), %xmm6
-; SSE-NEXT: por {{.*}}(%rip), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
@@ -5273,14 +4771,11 @@ define <16 x i8> @trunc_or_const_v16i64_
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
; AVX1-NEXT: vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; AVX1-NEXT: vandps %xmm5, %xmm4, %xmm4
@@ -5301,15 +4796,12 @@ define <16 x i8> @trunc_or_const_v16i64_
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3
-; AVX2-NEXT: vpor {{.*}}(%rip), %ymm2, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
@@ -5329,37 +4821,35 @@ define <16 x i8> @trunc_or_const_v16i64_
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512DQ-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
@@ -5369,10 +4859,6 @@ define <16 x i8> @trunc_or_const_v16i64_
define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i32_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
-; SSE-NEXT: por {{.*}}(%rip), %xmm1
-; SSE-NEXT: por {{.*}}(%rip), %xmm2
-; SSE-NEXT: por {{.*}}(%rip), %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
@@ -5381,12 +4867,11 @@ define <16 x i8> @trunc_or_const_v16i32_
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vandps %xmm3, %xmm2, %xmm2
@@ -5397,13 +4882,12 @@ define <16 x i8> @trunc_or_const_v16i32_
; AVX1-NEXT: vandps %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpor {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
@@ -5413,13 +4897,14 @@ define <16 x i8> @trunc_or_const_v16i32_
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
; AVX512: # BB#0:
-; AVX512-NEXT: vpord {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: retq
%1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = trunc <16 x i32> %1 to <16 x i8>
@@ -5429,55 +4914,54 @@ define <16 x i8> @trunc_or_const_v16i32_
define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i16_v16i8:
; SSE: # BB#0:
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
-; SSE-NEXT: por {{.*}}(%rip), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
; AVX1: # BB#0:
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
; AVX2: # BB#0:
-; AVX2-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpor {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
%2 = trunc <16 x i16> %1 to <16 x i8>
@@ -5492,99 +4976,65 @@ define <4 x i32> @mul_add_const_v4i64_v4
; SSE-LABEL: mul_add_const_v4i64_v4i32:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: pmuludq %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm3, %xmm5
+; SSE-NEXT: pmuludq %xmm2, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm3, %xmm2
+; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: paddq %xmm5, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: psrlq $32, %xmm1
+; SSE-NEXT: pmuludq %xmm3, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm0, %xmm4
-; SSE-NEXT: paddq %xmm3, %xmm4
+; SSE-NEXT: paddq %xmm1, %xmm4
; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: paddq %xmm4, %xmm0
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm0
-; SSE-NEXT: paddq {{.*}}(%rip), %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
+; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: mul_add_const_v4i64_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm1, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: mul_add_const_v4i64_v4i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_add_const_v4i64_v4i32:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX512F-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX512F-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_add_const_v4i64_v4i32:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX512BW-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX512BW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: mul_add_const_v4i64_v4i32:
; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1
+; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
@@ -5597,91 +5047,67 @@ define <4 x i32> @mul_add_const_v4i64_v4
define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE: # BB#0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: psrad $31, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: psrad $31, %xmm3
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: psrad $31, %xmm4
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrad $31, %xmm4
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: pmuludq %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm3, %xmm5
+; SSE-NEXT: pmuludq %xmm0, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm3, %xmm2
-; SSE-NEXT: paddq %xmm5, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm1, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pmuludq %xmm0, %xmm1
+; SSE-NEXT: paddq %xmm5, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: psrlq $32, %xmm0
+; SSE-NEXT: pmuludq %xmm3, %xmm0
+; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm0, %xmm4
-; SSE-NEXT: paddq %xmm3, %xmm4
+; SSE-NEXT: pmuludq %xmm2, %xmm4
+; SSE-NEXT: paddq %xmm0, %xmm4
; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm0, %xmm1
-; SSE-NEXT: paddq %xmm4, %xmm1
-; SSE-NEXT: paddq %xmm1, %xmm1
-; SSE-NEXT: paddq %xmm2, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: pmuludq %xmm2, %xmm3
+; SSE-NEXT: paddq %xmm4, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
+; SSE-NEXT: paddd %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: mul_add_self_v4i64_v4i32:
; AVX1: # BB#0:
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm1, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
-; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmuldq %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
+; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: mul_add_self_v4i64_v4i32:
; AVX2: # BB#0:
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_add_self_v4i64_v4i32:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX512F-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX512F-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_add_self_v4i64_v4i32:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX512BW-NEXT: vpmovsxdq %xmm1, %ymm1
-; AVX512BW-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddq %ymm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: mul_add_self_v4i64_v4i32:
@@ -5689,9 +5115,8 @@ define <4 x i32> @mul_add_self_v4i64_v4i
; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpaddq %ymm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX512DQ-NEXT: retq
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = sext <4 x i32> %a1 to <4 x i64>
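As the updated check lines above show, the bitwise/arithmetic ops now execute on the already-truncated xmm values (e.g. vpor/vpaddd on %xmm0) instead of on the wide ymm/zmm inputs, and the v4i64 multiplies collapse to vpmulld. A minimal IR-level sketch of the "or with a constant" case follows, with hypothetical function names; the transform itself runs on the SelectionDAG, so this is only the equivalent IR rewrite, not one of the tests in this patch:

  define <4 x i16> @trunc_or_example(<4 x i32> %a0) {
    ; Wide form: the OR is performed on <4 x i32> before truncating.
    %or  = or <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
    %res = trunc <4 x i32> %or to <4 x i16>
    ret <4 x i16> %res
  }

  define <4 x i16> @trunc_or_example_narrow(<4 x i32> %a0) {
    ; Narrow form, equivalent to what the combine produces at the DAG level:
    ; only %a0 needs a truncate at runtime; the constant vector operand
    ; truncates for free at compile time.
    %t   = trunc <4 x i32> %a0 to <4 x i16>
    %res = or <4 x i16> %t, <i16 1, i16 2, i16 3, i16 4>
    ret <4 x i16> %res
  }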