[llvm] r327247 - [X86][MMX] Support MMX build vectors to avoid SSE usage (PR29222)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Mar 11 12:22:13 PDT 2018
Author: rksimon
Date: Sun Mar 11 12:22:13 2018
New Revision: 327247
URL: http://llvm.org/viewvc/llvm-project?rev=327247&view=rev
Log:
[X86][MMX] Support MMX build vectors to avoid SSE usage (PR29222)
64-bit MMX vector generation usually ends up being lowered into SSE instructions before being spilled/reloaded as an MMX type.
This patch creates an MMX vector from MMX source values, taking the lowest element from each source and constructing broadcasts/build_vectors with direct calls to the MMX PUNPCKL/PSHUFW intrinsics.
We're missing a few consecutive-load combines that could be handled in a future patch if that would be useful - my main interest here is just avoiding a lot of the MMX/SSE crossover.
Differential Revision: https://reviews.llvm.org/D43618
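
For illustration (not part of the patch): a standalone C++ model of the PUNPCKL tree that the new createMMXBuildVector builds. Each round interleaves the low halves of adjacent operand pairs, halving the operand count, so N single-element sources merge in log2(N) rounds (PUNPCKLBW, then PUNPCKLWD, then PUNPCKLDQ for v8i8). Only the lane arithmetic mirrors the real instructions; the names and driver are invented for the example.

#include <cstdint>
#include <cstdio>
#include <vector>

// Interleave the low halves of two 64-bit "MMX" values: result lane 2i
// comes from Lo lane i, lane 2i+1 from Hi lane i (PUNPCKL semantics).
static uint64_t punpckl(uint64_t Lo, uint64_t Hi, unsigned EltBits) {
  uint64_t Mask = (1ULL << EltBits) - 1;
  unsigned HalfElts = 32 / EltBits; // lanes in the low 32 bits
  uint64_t R = 0;
  for (unsigned i = 0; i != HalfElts; ++i) {
    R |= ((Lo >> (i * EltBits)) & Mask) << (2 * i * EltBits);
    R |= ((Hi >> (i * EltBits)) & Mask) << ((2 * i + 1) * EltBits);
  }
  return R;
}

int main() {
  // Eight i8 values, each sitting in the low byte of its own register,
  // as CreateMMXElement would leave them.
  std::vector<uint64_t> Ops = {0, 1, 2, 3, 4, 5, 6, 7};
  unsigned EltBits = 8;
  while (Ops.size() > 1) { // punpcklbw, then punpcklwd, then punpckldq
    for (size_t i = 0; i != Ops.size(); i += 2)
      Ops[i / 2] = punpckl(Ops[i], Ops[i + 1], EltBits);
    Ops.resize(Ops.size() / 2);
    EltBits *= 2;
  }
  printf("%016llx\n", (unsigned long long)Ops[0]); // 0706050403020100
}

The final value places element i in lane i, i.e. a little-endian <8 x i8> <0,...,7> packed into one MMX-sized register.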
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/3dnow-intrinsics.ll
llvm/trunk/test/CodeGen/X86/mmx-build-vector.ll
llvm/trunk/test/CodeGen/X86/pr29222.ll
llvm/trunk/test/CodeGen/X86/vec_insert-mmx.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=327247&r1=327246&r2=327247&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Mar 11 12:22:13 2018
@@ -30991,6 +30991,79 @@ static SDValue combineCastedMaskArithmet
return SDValue();
}
+static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(N);
+ unsigned NumElts = N.getNumOperands();
+
+ auto *BV = cast<BuildVectorSDNode>(N);
+ SDValue Splat = BV->getSplatValue();
+
+ // Build MMX element from integer GPR or SSE float values.
+ auto CreateMMXElement = [&](SDValue V) {
+ if (V.isUndef())
+ return DAG.getUNDEF(MVT::x86mmx);
+ if (V.getValueType().isFloatingPoint()) {
+ if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
+ V = DAG.getBitcast(MVT::v2i64, V);
+ return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
+ }
+ V = DAG.getBitcast(MVT::i32, V);
+ } else {
+ V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
+ }
+ return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
+ };
+
+ // Convert build vector ops to MMX data in the bottom elements.
+ SmallVector<SDValue, 8> Ops;
+
+ // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
+ if (Splat) {
+ if (Splat.isUndef())
+ return DAG.getUNDEF(MVT::x86mmx);
+
+ Splat = CreateMMXElement(Splat);
+
+ if (Subtarget.hasSSE1()) {
+ // Unpack v8i8 to splat i8 elements to lowest 16-bits.
+ if (NumElts == 8)
+ Splat = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
+ DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
+ Splat);
+
+ // Use PSHUFW to repeat 16-bit elements.
+ unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
+ DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
+ DAG.getConstant(ShufMask, DL, MVT::i8));
+ }
+ Ops.append(NumElts, Splat);
+ } else {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Ops.push_back(CreateMMXElement(N.getOperand(i)));
+ }
+
+ // Use tree of PUNPCKLs to build up general MMX vector.
+ while (Ops.size() > 1) {
+ unsigned NumOps = Ops.size();
+ unsigned IntrinOp =
+ (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
+ : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
+ : Intrinsic::x86_mmx_punpcklbw));
+ SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
+ for (unsigned i = 0; i != NumOps; i += 2)
+ Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
+ Ops[i], Ops[i + 1]);
+ Ops.resize(NumOps / 2);
+ }
+
+ return Ops[0];
+}
+
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -31071,6 +31144,14 @@ static SDValue combineBitcast(SDNode *N,
}
}
+ // Detect bitcasts of 64-bit build vectors and convert to a
+ // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
+ // lowest element.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR &&
+ (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
+ SrcVT == MVT::v8i8))
+ return createMMXBuildVector(N0, DAG, Subtarget);
+
// Detect bitcasts between element or subvector extraction to x86mmx.
if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
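
The ShufMask values in the splat path above are ordinary PSHUFW immediates: two bits per destination word, each selecting a source word. A quick standalone C++ check (not LLVM code) of why imm 0 broadcasts the v4i16/v8i8 splat, which sits in the low 16 bits after the optional PUNPCKLBW, while imm 0x44 repeats the low dword for the v2i32 splat:

#include <cstdint>
#include <cstdio>

// PSHUFW: destination word i takes source word ((Imm >> 2*i) & 3).
static uint64_t pshufw(uint64_t Src, uint8_t Imm) {
  uint64_t R = 0;
  for (unsigned i = 0; i != 4; ++i) {
    unsigned Sel = (Imm >> (2 * i)) & 3;
    R |= ((Src >> (16 * Sel)) & 0xFFFF) << (16 * i);
  }
  return R;
}

int main() {
  // 16-bit splat value in word 0; imm 0 selects word 0 everywhere.
  printf("%016llx\n", (unsigned long long)pshufw(0xAAAA, 0x00));
  // prints aaaaaaaaaaaaaaaa
  // 32-bit splat value in words 0-1; imm 0x44 selects words [0,1,0,1].
  printf("%016llx\n", (unsigned long long)pshufw(0xBBBBAAAA, 0x44));
  // prints bbbbaaaabbbbaaaa
}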
Modified: llvm/trunk/test/CodeGen/X86/3dnow-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/3dnow-intrinsics.ll?rev=327247&r1=327246&r2=327247&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/3dnow-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/3dnow-intrinsics.ll Sun Mar 11 12:22:13 2018
@@ -35,12 +35,11 @@ define <2 x i32> @test_pf2id(<2 x float>
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: pf2id {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: pf2id %mm1, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -71,18 +70,15 @@ define <2 x float> @test_pfacc(<2 x floa
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfacc {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfacc %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -113,18 +109,15 @@ define <2 x float> @test_pfadd(<2 x floa
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfadd {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfadd %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -155,18 +148,15 @@ define <2 x i32> @test_pfcmpeq(<2 x floa
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfcmpeq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfcmpeq %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: movl (%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %ebp, %esp
@@ -198,18 +188,15 @@ define <2 x i32> @test_pfcmpge(<2 x floa
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfcmpge {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfcmpge %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: movl (%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %ebp, %esp
@@ -241,18 +228,15 @@ define <2 x i32> @test_pfcmpgt(<2 x floa
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfcmpgt {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfcmpgt %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: movl (%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %ebp, %esp
@@ -284,18 +268,15 @@ define <2 x float> @test_pfmax(<2 x floa
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfmax {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfmax %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -326,18 +307,15 @@ define <2 x float> @test_pfmin(<2 x floa
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfmin {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfmin %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -368,18 +346,15 @@ define <2 x float> @test_pfmul(<2 x floa
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfmul {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfmul %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -410,12 +385,11 @@ define <2 x float> @test_pfrcp(<2 x floa
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: pfrcp {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: pfrcp %mm1, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
@@ -445,18 +419,15 @@ define <2 x float> @test_pfrcpit1(<2 x f
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfrcpit1 {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfrcpit1 %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -487,18 +458,15 @@ define <2 x float> @test_pfrcpit2(<2 x f
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfrcpit2 {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfrcpit2 %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -529,12 +497,11 @@ define <2 x float> @test_pfrsqrt(<2 x fl
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: pfrsqrt {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: pfrsqrt %mm1, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
@@ -564,18 +531,15 @@ define <2 x float> @test_pfrsqit1(<2 x f
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfrsqit1 {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfrsqit1 %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -606,18 +570,15 @@ define <2 x float> @test_pfsub(<2 x floa
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfsub {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfsub %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -648,18 +609,15 @@ define <2 x float> @test_pfsubr(<2 x flo
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfsubr {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfsubr %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -748,12 +706,11 @@ define <2 x i32> @test_pf2iw(<2 x float>
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: pf2iw {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: pf2iw %mm1, %mm0
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -784,18 +741,15 @@ define <2 x float> @test_pfnacc(<2 x flo
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfnacc {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfnacc %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -826,18 +780,15 @@ define <2 x float> @test_pfpnacc(<2 x fl
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $24, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 20(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 16(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movq {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: pfpnacc {{[0-9]+}}(%esp), %mm0
-; X86-NEXT: movq %mm0, (%esp)
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 20(%ebp), %mm0
+; X86-NEXT: movd 16(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm2
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: pfpnacc %mm1, %mm2
+; X86-NEXT: movq %mm2, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: movl %ebp, %esp
@@ -899,12 +850,11 @@ define <2 x float> @test_pswapdsf(<2 x f
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: flds 12(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: flds 8(%ebp)
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: pswapd {{[0-9]+}}(%esp), %mm0 # mm0 = mem[1,0]
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: pswapd %mm1, %mm0 # mm0 = mm1[1,0]
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: flds (%esp)
@@ -932,12 +882,11 @@ define <2 x i32> @test_pswapdsi(<2 x i32
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
-; X86-NEXT: subl $16, %esp
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: pswapd {{[0-9]+}}(%esp), %mm0 # mm0 = mem[1,0]
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: movd 12(%ebp), %mm0
+; X86-NEXT: movd 8(%ebp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: pswapd %mm1, %mm0 # mm0 = mm1[1,0]
; X86-NEXT: movq %mm0, (%esp)
; X86-NEXT: movl (%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
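
In the X86 checks above, each float argument now reaches MMX as its raw 32-bit pattern via movd, and punpckldq packs two of them into one register. A minimal C++ sketch of that reinterpretation (a bitcast, not a numeric conversion; the helper name is made up):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reinterpret an f32 as its i32 bit pattern, as the combine's float
// element path does before moving the value into an MMX register.
static uint32_t FloatBits(float F) {
  uint32_t I;
  std::memcpy(&I, &F, sizeof(I));
  return I;
}

int main() {
  uint64_t Lo = FloatBits(1.0f);  // element 0 -> low dword (0x3f800000)
  uint64_t Hi = FloatBits(-2.0f); // element 1 -> high dword (0xc0000000)
  uint64_t MM = (Hi << 32) | Lo;  // effect of punpckldq %mmHi, %mmLo
  printf("%016llx\n", (unsigned long long)MM); // c00000003f800000
}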
Modified: llvm/trunk/test/CodeGen/X86/mmx-build-vector.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mmx-build-vector.ll?rev=327247&r1=327246&r2=327247&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mmx-build-vector.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mmx-build-vector.ll Sun Mar 11 12:22:13 2018
@@ -15,60 +15,24 @@ declare x86_mmx @llvm.x86.mmx.padd.d(x86
;
define void @build_v2i32_01(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
-; X86-MMX-LABEL: build_v2i32_01:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 12(%ebp), %ecx
-; X86-MMX-NEXT: movl 16(%ebp), %edx
-; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movl %ecx, (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE-LABEL: build_v2i32_01:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-8, %esp
-; X86-SSE-NEXT: subl $8, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movlps %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
-; X86-SSE-NEXT: paddd %mm0, %mm0
-; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
-; X86-SSE-NEXT: retl
+; X86-LABEL: build_v2i32_01:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: paddd %mm1, %mm1
+; X86-NEXT: movq %mm1, (%eax)
+; X86-NEXT: retl
;
-; X64-SSE-LABEL: build_v2i32_01:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movd %edx, %xmm0
-; X64-SSE-NEXT: movd %esi, %xmm1
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: build_v2i32_01:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovd %esi, %xmm0
-; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v2i32_01:
+; X64: # %bb.0:
+; X64-NEXT: movd %edx, %mm0
+; X64-NEXT: movd %esi, %mm1
+; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT: paddd %mm1, %mm1
+; X64-NEXT: movq %mm1, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 %a0, i32 0
%2 = insertelement <2 x i32> %1, i32 %a1, i32 1
%3 = bitcast <2 x i32> %2 to x86_mmx
@@ -103,76 +67,29 @@ define void @build_v2i32_0z(x86_mmx *%p0
define void @build_v2i32_u1(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
; X86-MMX-LABEL: build_v2i32_u1:
; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 16(%ebp), %ecx
-; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
+; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v2i32_u1:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-8, %esp
-; X86-SSE-NEXT: subl $8, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X86-SSE-NEXT: movq %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
; X86-SSE-NEXT: paddd %mm0, %mm0
; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
;
-; X64-SSE-LABEL: build_v2i32_u1:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movd %edx, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX1-LABEL: build_v2i32_u1:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovd %edx, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v2i32_u1:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovd %edx, %xmm0
-; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v2i32_u1:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovd %edx, %xmm0
-; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X64-LABEL: build_v2i32_u1:
+; X64: # %bb.0:
+; X64-NEXT: movd %edx, %mm0
+; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
+; X64-NEXT: paddd %mm0, %mm0
+; X64-NEXT: movq %mm0, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 undef, i32 0
%2 = insertelement <2 x i32> %1, i32 %a1, i32 1
%3 = bitcast <2 x i32> %2 to x86_mmx
@@ -182,63 +99,24 @@ define void @build_v2i32_u1(x86_mmx *%p0
}
define void @build_v2i32_z1(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
-; X86-MMX-LABEL: build_v2i32_z1:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 16(%ebp), %ecx
-; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movl $0, (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE-LABEL: build_v2i32_z1:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-8, %esp
-; X86-SSE-NEXT: subl $8, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; X86-SSE-NEXT: movq %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
-; X86-SSE-NEXT: paddd %mm0, %mm0
-; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
-; X86-SSE-NEXT: retl
+; X86-LABEL: build_v2i32_z1:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: pxor %mm1, %mm1
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: paddd %mm1, %mm1
+; X86-NEXT: movq %mm1, (%eax)
+; X86-NEXT: retl
;
-; X64-SSE-LABEL: build_v2i32_z1:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: # kill: def $edx killed $edx def $rdx
-; X64-SSE-NEXT: movq %rdx, %xmm0
-; X64-SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: build_v2i32_z1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: # kill: def $edx killed $edx def $rdx
-; X64-AVX-NEXT: vmovq %rdx, %xmm0
-; X64-AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v2i32_z1:
+; X64: # %bb.0:
+; X64-NEXT: movd %edx, %mm0
+; X64-NEXT: pxor %mm1, %mm1
+; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT: paddd %mm1, %mm1
+; X64-NEXT: movq %mm1, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 0, i32 0
%2 = insertelement <2 x i32> %1, i32 %a1, i32 1
%3 = bitcast <2 x i32> %2 to x86_mmx
@@ -250,77 +128,29 @@ define void @build_v2i32_z1(x86_mmx *%p0
define void @build_v2i32_00(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
; X86-MMX-LABEL: build_v2i32_00:
; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 12(%ebp), %ecx
-; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movl %ecx, (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
+; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v2i32_00:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-8, %esp
-; X86-SSE-NEXT: subl $8, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X86-SSE-NEXT: movq %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
; X86-SSE-NEXT: paddd %mm0, %mm0
; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
;
-; X64-SSE-LABEL: build_v2i32_00:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movd %esi, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX1-LABEL: build_v2i32_00:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovd %esi, %xmm0
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v2i32_00:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovd %esi, %xmm0
-; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v2i32_00:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovd %esi, %xmm0
-; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X64-LABEL: build_v2i32_00:
+; X64: # %bb.0:
+; X64-NEXT: movd %esi, %mm0
+; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
+; X64-NEXT: paddd %mm0, %mm0
+; X64-NEXT: movq %mm0, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <2 x i32> undef, i32 %a0, i32 0
%2 = insertelement <2 x i32> %1, i32 %a0, i32 1
%3 = bitcast <2 x i32> %2 to x86_mmx
@@ -334,95 +164,32 @@ define void @build_v2i32_00(x86_mmx *%p0
;
define void @build_v4i16_0123(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
-; X86-MMX-LABEL: build_v4i16_0123:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 24(%ebp), %ecx
-; X86-MMX-NEXT: shll $16, %ecx
-; X86-MMX-NEXT: movzwl 20(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movl 16(%ebp), %ecx
-; X86-MMX-NEXT: shll $16, %ecx
-; X86-MMX-NEXT: movzwl 12(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movl %edx, (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE-LABEL: build_v4i16_0123:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-8, %esp
-; X86-SSE-NEXT: subl $8, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pinsrw $1, 16(%ebp), %xmm0
-; X86-SSE-NEXT: pinsrw $2, 20(%ebp), %xmm0
-; X86-SSE-NEXT: pinsrw $3, 24(%ebp), %xmm0
-; X86-SSE-NEXT: movq %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
-; X86-SSE-NEXT: paddd %mm0, %mm0
-; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
-; X86-SSE-NEXT: retl
+; X86-LABEL: build_v4i16_0123:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
+; X86-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1]
+; X86-NEXT: punpckldq %mm1, %mm2 # mm2 = mm2[0],mm1[0]
+; X86-NEXT: paddd %mm2, %mm2
+; X86-NEXT: movq %mm2, (%eax)
+; X86-NEXT: retl
;
-; X64-SSE2-LABEL: build_v4i16_0123:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movd %r8d, %xmm0
-; X64-SSE2-NEXT: movd %ecx, %xmm1
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE2-NEXT: movd %edx, %xmm0
-; X64-SSE2-NEXT: movd %esi, %xmm2
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
-; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v4i16_0123:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movd %r8d, %xmm0
-; X64-SSSE3-NEXT: movd %ecx, %xmm1
-; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSSE3-NEXT: movd %edx, %xmm0
-; X64-SSSE3-NEXT: movd %esi, %xmm2
-; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X64-SSSE3-NEXT: movq %xmm2, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
-;
-; X64-AVX-LABEL: build_v4i16_0123:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovd %esi, %xmm0
-; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrd $3, %r8d, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v4i16_0123:
+; X64: # %bb.0:
+; X64-NEXT: movd %r8d, %mm0
+; X64-NEXT: movd %ecx, %mm1
+; X64-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
+; X64-NEXT: movd %edx, %mm0
+; X64-NEXT: movd %esi, %mm2
+; X64-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1]
+; X64-NEXT: punpckldq %mm1, %mm2 # mm2 = mm2[0],mm1[0]
+; X64-NEXT: paddd %mm2, %mm2
+; X64-NEXT: movq %mm2, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <4 x i16> undef, i16 %a0, i32 0
%2 = insertelement <4 x i16> %1, i16 %a1, i32 1
%3 = insertelement <4 x i16> %2, i16 %a2, i32 2
@@ -434,105 +201,30 @@ define void @build_v4i16_0123(x86_mmx *%
}
define void @build_v4i16_01zz(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
-; X86-MMX-LABEL: build_v4i16_01zz:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 16(%ebp), %ecx
-; X86-MMX-NEXT: shll $16, %ecx
-; X86-MMX-NEXT: movzwl 12(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movl %edx, (%esp)
-; X86-MMX-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
+; X86-LABEL: build_v4i16_01zz:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
+; X86-NEXT: pxor %mm0, %mm0
+; X86-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
+; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-NEXT: paddd %mm1, %mm1
+; X86-NEXT: movq %mm1, (%eax)
+; X86-NEXT: retl
;
-; X86-SSE2-LABEL: build_v4i16_01zz:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-8, %esp
-; X86-SSE2-NEXT: subl $8, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: movq %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
-; X86-SSE2-NEXT: paddd %mm0, %mm0
-; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSSE3-LABEL: build_v4i16_01zz:
-; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-8, %esp
-; X86-SSSE3-NEXT: subl $8, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X86-SSSE3-NEXT: movq %xmm1, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
-; X86-SSSE3-NEXT: retl
-;
-; X64-SSE2-LABEL: build_v4i16_01zz:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movd %edx, %xmm0
-; X64-SSE2-NEXT: movd %esi, %xmm1
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v4i16_01zz:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movd %edx, %xmm0
-; X64-SSSE3-NEXT: movd %esi, %xmm1
-; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
-;
-; X64-AVX-LABEL: build_v4i16_01zz:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovd %edx, %xmm0
-; X64-AVX-NEXT: vmovd %esi, %xmm1
-; X64-AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v4i16_01zz:
+; X64: # %bb.0:
+; X64-NEXT: movd %edx, %mm0
+; X64-NEXT: movd %esi, %mm1
+; X64-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1]
+; X64-NEXT: pxor %mm0, %mm0
+; X64-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
+; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT: paddd %mm1, %mm1
+; X64-NEXT: movq %mm1, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <4 x i16> undef, i16 %a0, i32 0
%2 = insertelement <4 x i16> %1, i16 %a1, i32 1
%3 = insertelement <4 x i16> %2, i16 0, i32 2
@@ -596,74 +288,30 @@ define void @build_v4i16_0zuz(x86_mmx *%
}
define void @build_v4i16_012u(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
-; X86-MMX-LABEL: build_v4i16_012u:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 16(%ebp), %ecx
-; X86-MMX-NEXT: shll $16, %ecx
-; X86-MMX-NEXT: movzwl 12(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movl %edx, (%esp)
-; X86-MMX-NEXT: shll $16, %ecx
-; X86-MMX-NEXT: movzwl 20(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE-LABEL: build_v4i16_012u:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-8, %esp
-; X86-SSE-NEXT: subl $8, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pinsrw $1, 16(%ebp), %xmm0
-; X86-SSE-NEXT: pinsrw $2, 20(%ebp), %xmm0
-; X86-SSE-NEXT: movq %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
-; X86-SSE-NEXT: paddd %mm0, %mm0
-; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
-; X86-SSE-NEXT: retl
+; X86-LABEL: build_v4i16_012u:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
+; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: paddd %mm2, %mm2
+; X86-NEXT: movq %mm2, (%eax)
+; X86-NEXT: retl
;
-; X64-SSE-LABEL: build_v4i16_012u:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movd %edx, %xmm0
-; X64-SSE-NEXT: movd %esi, %xmm1
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: build_v4i16_012u:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovd %esi, %xmm0
-; X64-AVX-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v4i16_012u:
+; X64: # %bb.0:
+; X64-NEXT: movd %ecx, %mm0
+; X64-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
+; X64-NEXT: movd %edx, %mm1
+; X64-NEXT: movd %esi, %mm2
+; X64-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X64-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X64-NEXT: paddd %mm2, %mm2
+; X64-NEXT: movq %mm2, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <4 x i16> undef, i16 %a0, i32 0
%2 = insertelement <4 x i16> %1, i16 %a1, i32 1
%3 = insertelement <4 x i16> %2, i16 %a2, i32 2
@@ -677,117 +325,30 @@ define void @build_v4i16_012u(x86_mmx *%
define void @build_v4i16_0u00(x86_mmx *%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind {
; X86-MMX-LABEL: build_v4i16_0u00:
; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movzwl 12(%ebp), %ecx
-; X86-MMX-NEXT: movl %ecx, %edx
-; X86-MMX-NEXT: shll $16, %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: shll $16, %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movl %edx, (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
+; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-MMX-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
+; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
-; X86-SSE2-LABEL: build_v4i16_0u00:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-8, %esp
-; X86-SSE2-NEXT: subl $8, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
-; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: movq %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
-; X86-SSE2-NEXT: paddd %mm0, %mm0
-; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSSE3-LABEL: build_v4i16_0u00:
-; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-8, %esp
-; X86-SSSE3-NEXT: subl $8, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3]
-; X86-SSSE3-NEXT: movq %xmm0, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
-; X86-SSSE3-NEXT: retl
-;
-; X64-SSE2-LABEL: build_v4i16_0u00:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movd %esi, %xmm0
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,0]
-; X64-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v4i16_0u00:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movd %esi, %xmm0
-; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3]
-; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
-;
-; X64-AVX1-LABEL: build_v4i16_0u00:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovd %esi, %xmm0
-; X64-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,0,1,0,1,0,1,0,1,0,1,2,3]
-; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v4i16_0u00:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovd %esi, %xmm0
-; X64-AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v4i16_0u00:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovd %esi, %xmm0
-; X64-AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; X64-AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X86-SSE-LABEL: build_v4i16_0u00:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-SSE-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
+; X86-SSE-NEXT: paddd %mm0, %mm0
+; X86-SSE-NEXT: movq %mm0, (%eax)
+; X86-SSE-NEXT: retl
+;
+; X64-LABEL: build_v4i16_0u00:
+; X64: # %bb.0:
+; X64-NEXT: movd %esi, %mm0
+; X64-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
+; X64-NEXT: paddd %mm0, %mm0
+; X64-NEXT: movq %mm0, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <4 x i16> undef, i16 %a0, i32 0
%2 = insertelement <4 x i16> %1, i16 undef, i32 1
%3 = insertelement <4 x i16> %2, i16 %a0, i32 2
@@ -803,124 +364,48 @@ define void @build_v4i16_0u00(x86_mmx *%
;
define void @build_v8i8_01234567(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
-; X86-MMX-LABEL: build_v8i8_01234567:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: pushl %esi
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $16, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 40(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 36(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: shll $16, %edx
-; X86-MMX-NEXT: movl 32(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 28(%ebp), %esi
-; X86-MMX-NEXT: orl %ecx, %esi
-; X86-MMX-NEXT: movzwl %si, %ecx
-; X86-MMX-NEXT: orl %edx, %ecx
-; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movl 24(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 20(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: shll $16, %edx
-; X86-MMX-NEXT: movl 16(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 12(%ebp), %esi
-; X86-MMX-NEXT: orl %ecx, %esi
-; X86-MMX-NEXT: movzwl %si, %ecx
-; X86-MMX-NEXT: orl %edx, %ecx
-; X86-MMX-NEXT: movl %ecx, (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: leal -4(%ebp), %esp
-; X86-MMX-NEXT: popl %esi
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
-;
-; X86-SSE-LABEL: build_v8i8_01234567:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: andl $-8, %esp
-; X86-SSE-NEXT: subl $16, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movl 24(%ebp), %ecx
-; X86-SSE-NEXT: shll $8, %ecx
-; X86-SSE-NEXT: movzbl 20(%ebp), %edx
-; X86-SSE-NEXT: orl %ecx, %edx
-; X86-SSE-NEXT: movl 16(%ebp), %ecx
-; X86-SSE-NEXT: shll $8, %ecx
-; X86-SSE-NEXT: movzbl 12(%ebp), %esi
-; X86-SSE-NEXT: orl %ecx, %esi
-; X86-SSE-NEXT: movd %esi, %xmm0
-; X86-SSE-NEXT: pinsrw $1, %edx, %xmm0
-; X86-SSE-NEXT: movl 32(%ebp), %ecx
-; X86-SSE-NEXT: shll $8, %ecx
-; X86-SSE-NEXT: movzbl 28(%ebp), %edx
-; X86-SSE-NEXT: orl %ecx, %edx
-; X86-SSE-NEXT: pinsrw $2, %edx, %xmm0
-; X86-SSE-NEXT: movl 40(%ebp), %ecx
-; X86-SSE-NEXT: shll $8, %ecx
-; X86-SSE-NEXT: movzbl 36(%ebp), %edx
-; X86-SSE-NEXT: orl %ecx, %edx
-; X86-SSE-NEXT: pinsrw $3, %edx, %xmm0
-; X86-SSE-NEXT: movq %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
-; X86-SSE-NEXT: paddd %mm0, %mm0
-; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: leal -4(%ebp), %esp
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: popl %ebp
-; X86-SSE-NEXT: retl
+; X86-LABEL: build_v8i8_01234567:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
+; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
+; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm3
+; X86-NEXT: punpcklbw %mm0, %mm3 # mm3 = mm3[0],mm0[0],mm3[1],mm0[1],mm3[2],mm0[2],mm3[3],mm0[3]
+; X86-NEXT: punpcklwd %mm1, %mm3 # mm3 = mm3[0],mm1[0],mm3[1],mm1[1]
+; X86-NEXT: punpckldq %mm2, %mm3 # mm3 = mm3[0],mm2[0]
+; X86-NEXT: paddd %mm3, %mm3
+; X86-NEXT: movq %mm3, (%eax)
+; X86-NEXT: retl
;
-; X64-SSE-LABEL: build_v8i8_01234567:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: shll $8, %r8d
-; X64-SSE-NEXT: movzbl %cl, %eax
-; X64-SSE-NEXT: orl %r8d, %eax
-; X64-SSE-NEXT: shll $8, %edx
-; X64-SSE-NEXT: movzbl %sil, %ecx
-; X64-SSE-NEXT: orl %edx, %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: pinsrw $1, %eax, %xmm0
-; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT: shll $8, %eax
-; X64-SSE-NEXT: movzbl %r9b, %ecx
-; X64-SSE-NEXT: orl %eax, %ecx
-; X64-SSE-NEXT: pinsrw $2, %ecx, %xmm0
-; X64-SSE-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; X64-SSE-NEXT: shll $8, %eax
-; X64-SSE-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx
-; X64-SSE-NEXT: orl %eax, %ecx
-; X64-SSE-NEXT: pinsrw $3, %ecx, %xmm0
-; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: build_v8i8_01234567:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovd %esi, %xmm0
-; X64-AVX-NEXT: vpinsrb $1, %edx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $3, %r8d, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $4, %r9d, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $5, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrb $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v8i8_01234567:
+; X64: # %bb.0:
+; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm0
+; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm1
+; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: movd %r9d, %mm0
+; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm2
+; X64-NEXT: punpcklbw %mm2, %mm0 # mm0 = mm0[0],mm2[0],mm0[1],mm2[1],mm0[2],mm2[2],mm0[3],mm2[3]
+; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
+; X64-NEXT: movd %r8d, %mm1
+; X64-NEXT: movd %ecx, %mm2
+; X64-NEXT: punpcklbw %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1],mm2[2],mm1[2],mm2[3],mm1[3]
+; X64-NEXT: movd %edx, %mm1
+; X64-NEXT: movd %esi, %mm3
+; X64-NEXT: punpcklbw %mm1, %mm3 # mm3 = mm3[0],mm1[0],mm3[1],mm1[1],mm3[2],mm1[2],mm3[3],mm1[3]
+; X64-NEXT: punpcklwd %mm2, %mm3 # mm3 = mm3[0],mm2[0],mm3[1],mm2[1]
+; X64-NEXT: punpckldq %mm0, %mm3 # mm3 = mm3[0],mm0[0]
+; X64-NEXT: paddd %mm3, %mm3
+; X64-NEXT: movq %mm3, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 %a0, i32 0
%2 = insertelement <8 x i8> %1, i8 %a1, i32 1
%3 = insertelement <8 x i8> %2, i8 %a2, i32 2
@@ -936,158 +421,46 @@ define void @build_v8i8_01234567(x86_mmx
}
define void @build_v8i8_0u2345z7(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
-; X86-MMX-LABEL: build_v8i8_0u2345z7:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 24(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 20(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: shll $16, %edx
-; X86-MMX-NEXT: movzbl 12(%ebp), %ecx
-; X86-MMX-NEXT: orl %edx, %ecx
-; X86-MMX-NEXT: movl %ecx, (%esp)
-; X86-MMX-NEXT: movl 32(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 28(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movzwl %dx, %ecx
-; X86-MMX-NEXT: movl 40(%ebp), %edx
-; X86-MMX-NEXT: shll $24, %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
+; X86-LABEL: build_v8i8_0u2345z7:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: pxor %mm1, %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
+; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
+; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X86-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
+; X86-NEXT: punpckldq %mm2, %mm0 # mm0 = mm0[0],mm2[0]
+; X86-NEXT: paddd %mm0, %mm0
+; X86-NEXT: movq %mm0, (%eax)
+; X86-NEXT: retl
;
-; X86-SSE2-LABEL: build_v8i8_0u2345z7:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-8, %esp
-; X86-SSE2-NEXT: subl $8, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: pxor %xmm1, %xmm1
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X86-SSE2-NEXT: packuswb %xmm0, %xmm0
-; X86-SSE2-NEXT: movq %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
-; X86-SSE2-NEXT: paddd %mm0, %mm0
-; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSSE3-LABEL: build_v8i8_0u2345z7:
-; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-8, %esp
-; X86-SSSE3-NEXT: subl $8, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: pxor %xmm1, %xmm1
-; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u]
-; X86-SSSE3-NEXT: movq %xmm0, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
-; X86-SSSE3-NEXT: retl
-;
-; X64-SSE2-LABEL: build_v8i8_0u2345z7:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE2-NEXT: pxor %xmm1, %xmm1
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-SSE2-NEXT: movd %r9d, %xmm0
-; X64-SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE2-NEXT: movd %r8d, %xmm1
-; X64-SSE2-NEXT: movd %ecx, %xmm2
-; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X64-SSE2-NEXT: movd %esi, %xmm1
-; X64-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm1
-; X64-SSE2-NEXT: packuswb %xmm1, %xmm1
-; X64-SSE2-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v8i8_0u2345z7:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSSE3-NEXT: pxor %xmm1, %xmm1
-; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-SSSE3-NEXT: movd %r9d, %xmm0
-; X64-SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSSE3-NEXT: movd %r8d, %xmm1
-; X64-SSSE3-NEXT: movd %ecx, %xmm2
-; X64-SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X64-SSSE3-NEXT: movd %esi, %xmm1
-; X64-SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X64-SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,u,4,6,8,10],zero,xmm1[14,u,u,u,u,u,u,u,u]
-; X64-SSSE3-NEXT: movq %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
-;
-; X64-AVX-LABEL: build_v8i8_0u2345z7:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $0, %esi, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $4, %r9d, %xmm0, %xmm0
-; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; X64-AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0
-; X64-AVX-NEXT: movl {{[0-9]+}}(%rsp), %eax
-; X64-AVX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,u,4,6,8,10],zero,xmm0[14,u,u,u,u,u,u,u,u]
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v8i8_0u2345z7:
+; X64: # %bb.0:
+; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm0
+; X64-NEXT: pxor %mm1, %mm1
+; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: movd %r9d, %mm0
+; X64-NEXT: movd {{[0-9]+}}(%rsp), %mm2
+; X64-NEXT: punpcklbw %mm2, %mm0 # mm0 = mm0[0],mm2[0],mm0[1],mm2[1],mm0[2],mm2[2],mm0[3],mm2[3]
+; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
+; X64-NEXT: movd %r8d, %mm1
+; X64-NEXT: movd %ecx, %mm2
+; X64-NEXT: punpcklbw %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1],mm2[2],mm1[2],mm2[3],mm1[3]
+; X64-NEXT: movd %esi, %mm1
+; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: punpcklwd %mm2, %mm1 # mm1 = mm1[0],mm2[0],mm1[1],mm2[1]
+; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT: paddd %mm1, %mm1
+; X64-NEXT: movq %mm1, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 %a0, i32 0
%2 = insertelement <8 x i8> %1, i8 undef, i32 1
%3 = insertelement <8 x i8> %2, i8 %a2, i32 2
@@ -1103,128 +476,44 @@ define void @build_v8i8_0u2345z7(x86_mmx
}
define void @build_v8i8_0123zzzu(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
-; X86-MMX-LABEL: build_v8i8_0123zzzu:
-; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: pushl %esi
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $16, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movl 24(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 20(%ebp), %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: shll $16, %edx
-; X86-MMX-NEXT: movl 16(%ebp), %ecx
-; X86-MMX-NEXT: shll $8, %ecx
-; X86-MMX-NEXT: movzbl 12(%ebp), %esi
-; X86-MMX-NEXT: orl %ecx, %esi
-; X86-MMX-NEXT: movzwl %si, %ecx
-; X86-MMX-NEXT: orl %edx, %ecx
-; X86-MMX-NEXT: movl %ecx, (%esp)
-; X86-MMX-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: leal -4(%ebp), %esp
-; X86-MMX-NEXT: popl %esi
-; X86-MMX-NEXT: popl %ebp
-; X86-MMX-NEXT: retl
+; X86-LABEL: build_v8i8_0123zzzu:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2
+; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
+; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X86-NEXT: pxor %mm0, %mm0
+; X86-NEXT: pxor %mm1, %mm1
+; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X86-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X86-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
+; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X86-NEXT: paddd %mm2, %mm2
+; X86-NEXT: movq %mm2, (%eax)
+; X86-NEXT: retl
;
-; X86-SSE2-LABEL: build_v8i8_0123zzzu:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-8, %esp
-; X86-SSE2-NEXT: subl $8, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movl 12(%ebp), %ecx
-; X86-SSE2-NEXT: pxor %xmm0, %xmm0
-; X86-SSE2-NEXT: pinsrw $0, %ecx, %xmm0
-; X86-SSE2-NEXT: movl 16(%ebp), %ecx
-; X86-SSE2-NEXT: pinsrw $1, %ecx, %xmm0
-; X86-SSE2-NEXT: movl 20(%ebp), %ecx
-; X86-SSE2-NEXT: pinsrw $2, %ecx, %xmm0
-; X86-SSE2-NEXT: movl 24(%ebp), %ecx
-; X86-SSE2-NEXT: pinsrw $3, %ecx, %xmm0
-; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X86-SSE2-NEXT: packuswb %xmm0, %xmm0
-; X86-SSE2-NEXT: movq %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
-; X86-SSE2-NEXT: paddd %mm0, %mm0
-; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSSE3-LABEL: build_v8i8_0123zzzu:
-; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-8, %esp
-; X86-SSSE3-NEXT: subl $8, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: movl 12(%ebp), %ecx
-; X86-SSSE3-NEXT: pxor %xmm0, %xmm0
-; X86-SSSE3-NEXT: pinsrw $0, %ecx, %xmm0
-; X86-SSSE3-NEXT: movl 16(%ebp), %ecx
-; X86-SSSE3-NEXT: pinsrw $1, %ecx, %xmm0
-; X86-SSSE3-NEXT: movl 20(%ebp), %ecx
-; X86-SSSE3-NEXT: pinsrw $2, %ecx, %xmm0
-; X86-SSSE3-NEXT: movl 24(%ebp), %ecx
-; X86-SSSE3-NEXT: pinsrw $3, %ecx, %xmm0
-; X86-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
-; X86-SSSE3-NEXT: movq %xmm0, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
-; X86-SSSE3-NEXT: retl
-;
-; X64-SSE2-LABEL: build_v8i8_0123zzzu:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: pxor %xmm0, %xmm0
-; X64-SSE2-NEXT: pinsrw $0, %esi, %xmm0
-; X64-SSE2-NEXT: pinsrw $1, %edx, %xmm0
-; X64-SSE2-NEXT: pinsrw $2, %ecx, %xmm0
-; X64-SSE2-NEXT: pinsrw $3, %r8d, %xmm0
-; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; X64-SSE2-NEXT: packuswb %xmm0, %xmm0
-; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v8i8_0123zzzu:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: pxor %xmm0, %xmm0
-; X64-SSSE3-NEXT: pinsrw $0, %esi, %xmm0
-; X64-SSSE3-NEXT: pinsrw $1, %edx, %xmm0
-; X64-SSSE3-NEXT: pinsrw $2, %ecx, %xmm0
-; X64-SSSE3-NEXT: pinsrw $3, %r8d, %xmm0
-; X64-SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
-; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
-;
-; X64-AVX-LABEL: build_v8i8_0123zzzu:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $0, %esi, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $2, %ecx, %xmm0, %xmm0
-; X64-AVX-NEXT: vpinsrw $3, %r8d, %xmm0, %xmm0
-; X64-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v8i8_0123zzzu:
+; X64: # %bb.0:
+; X64-NEXT: movd %r8d, %mm0
+; X64-NEXT: movd %ecx, %mm1
+; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: movd %edx, %mm0
+; X64-NEXT: movd %esi, %mm2
+; X64-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3]
+; X64-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1]
+; X64-NEXT: pxor %mm0, %mm0
+; X64-NEXT: pxor %mm1, %mm1
+; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3]
+; X64-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1]
+; X64-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0]
+; X64-NEXT: paddd %mm2, %mm2
+; X64-NEXT: movq %mm2, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 %a0, i32 0
%2 = insertelement <8 x i8> %1, i8 %a1, i32 1
%3 = insertelement <8 x i8> %2, i8 %a2, i32 2
@@ -1302,119 +591,33 @@ define void @build_v8i8_0zzzzzzu(x86_mmx
define void @build_v8i8_00000000(x86_mmx *%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind {
; X86-MMX-LABEL: build_v8i8_00000000:
; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: movzbl 12(%ebp), %ecx
-; X86-MMX-NEXT: movl %ecx, %edx
-; X86-MMX-NEXT: shll $8, %edx
-; X86-MMX-NEXT: orl %ecx, %edx
-; X86-MMX-NEXT: movl %edx, %ecx
-; X86-MMX-NEXT: shll $16, %ecx
-; X86-MMX-NEXT: orl %edx, %ecx
-; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movl %ecx, (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
+; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-MMX-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X86-MMX-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1]
+; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
-; X86-SSE2-LABEL: build_v8i8_00000000:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-8, %esp
-; X86-SSE2-NEXT: subl $8, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X86-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X86-SSE2-NEXT: packuswb %xmm0, %xmm0
-; X86-SSE2-NEXT: movq %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
-; X86-SSE2-NEXT: paddd %mm0, %mm0
-; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSSE3-LABEL: build_v8i8_00000000:
-; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-8, %esp
-; X86-SSSE3-NEXT: subl $8, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: pxor %xmm0, %xmm0
-; X86-SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: pshufb %xmm0, %xmm1
-; X86-SSSE3-NEXT: movq %xmm1, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
-; X86-SSSE3-NEXT: retl
-;
-; X64-SSE2-LABEL: build_v8i8_00000000:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movd %esi, %xmm0
-; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X64-SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; X64-SSE2-NEXT: packuswb %xmm0, %xmm0
-; X64-SSE2-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v8i8_00000000:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movd %esi, %xmm0
-; X64-SSSE3-NEXT: pxor %xmm1, %xmm1
-; X64-SSSE3-NEXT: pshufb %xmm1, %xmm0
-; X64-SSSE3-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
-;
-; X64-AVX1-LABEL: build_v8i8_00000000:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovd %esi, %xmm0
-; X64-AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v8i8_00000000:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovd %esi, %xmm0
-; X64-AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
-; X64-AVX2-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v8i8_00000000:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovd %esi, %xmm0
-; X64-AVX512-NEXT: vpbroadcastb %xmm0, %xmm0
-; X64-AVX512-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X86-SSE-LABEL: build_v8i8_00000000:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-SSE-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X86-SSE-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
+; X86-SSE-NEXT: paddd %mm0, %mm0
+; X86-SSE-NEXT: movq %mm0, (%eax)
+; X86-SSE-NEXT: retl
+;
+; X64-LABEL: build_v8i8_00000000:
+; X64: # %bb.0:
+; X64-NEXT: movd %esi, %mm0
+; X64-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3]
+; X64-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0]
+; X64-NEXT: paddd %mm0, %mm0
+; X64-NEXT: movq %mm0, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <8 x i8> undef, i8 %a0, i32 0
%2 = insertelement <8 x i8> %1, i8 %a0, i32 1
%3 = insertelement <8 x i8> %2, i8 %a0, i32 2
@@ -1436,55 +639,34 @@ define void @build_v8i8_00000000(x86_mmx
define void @build_v2f32_01(x86_mmx *%p0, float %a0, float %a1) nounwind {
; X86-MMX-LABEL: build_v2f32_01:
; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: flds 12(%ebp)
-; X86-MMX-NEXT: flds 16(%ebp)
-; X86-MMX-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: fstps (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
+; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-MMX-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-MMX-NEXT: paddd %mm1, %mm1
+; X86-MMX-NEXT: movq %mm1, (%eax)
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v2f32_01:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-16, %esp
-; X86-SSE-NEXT: subl $32, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movaps %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
-; X86-SSE-NEXT: paddd %mm0, %mm0
-; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movdq2q %xmm0, %mm0
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movdq2q %xmm0, %mm1
+; X86-SSE-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-SSE-NEXT: paddd %mm1, %mm1
+; X86-SSE-NEXT: movq %mm1, (%eax)
; X86-SSE-NEXT: retl
;
-; X64-SSE-LABEL: build_v2f32_01:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: build_v2f32_01:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v2f32_01:
+; X64: # %bb.0:
+; X64-NEXT: movdq2q %xmm1, %mm0
+; X64-NEXT: movdq2q %xmm0, %mm1
+; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT: paddd %mm1, %mm1
+; X64-NEXT: movq %mm1, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float %a0, i32 0
%2 = insertelement <2 x float> %1, float %a1, i32 1
%3 = bitcast <2 x float> %2 to x86_mmx
@@ -1496,76 +678,33 @@ define void @build_v2f32_01(x86_mmx *%p0
define void @build_v2f32_0z(x86_mmx *%p0, float %a0, float %a1) nounwind {
; X86-MMX-LABEL: build_v2f32_0z:
; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: flds 12(%ebp)
-; X86-MMX-NEXT: fstps (%esp)
-; X86-MMX-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
+; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-MMX-NEXT: pxor %mm0, %mm0
+; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X86-MMX-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-MMX-NEXT: paddd %mm1, %mm1
+; X86-MMX-NEXT: movq %mm1, (%eax)
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v2f32_0z:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-16, %esp
-; X86-SSE-NEXT: subl $32, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movaps %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
+; X86-SSE-NEXT: movdq2q %xmm0, %mm0
+; X86-SSE-NEXT: pxor %mm1, %mm1
+; X86-SSE-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0]
; X86-SSE-NEXT: paddd %mm0, %mm0
; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
; X86-SSE-NEXT: retl
;
-; X64-SSE-LABEL: build_v2f32_0z:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: xorps %xmm1, %xmm1
-; X64-SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
-; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX1-LABEL: build_v2f32_0z:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64-AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v2f32_0z:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v2f32_0z:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64-AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X64-LABEL: build_v2f32_0z:
+; X64: # %bb.0:
+; X64-NEXT: movdq2q %xmm0, %mm0
+; X64-NEXT: pxor %mm1, %mm1
+; X64-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0]
+; X64-NEXT: paddd %mm0, %mm0
+; X64-NEXT: movq %mm0, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float %a0, i32 0
%2 = insertelement <2 x float> %1, float 0.0, i32 1
%3 = bitcast <2 x float> %2 to x86_mmx
@@ -1577,98 +716,30 @@ define void @build_v2f32_0z(x86_mmx *%p0
define void @build_v2f32_u1(x86_mmx *%p0, float %a0, float %a1) nounwind {
; X86-MMX-LABEL: build_v2f32_u1:
; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: flds 16(%ebp)
-; X86-MMX-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
+; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
-; X86-SSE2-LABEL: build_v2f32_u1:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-16, %esp
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
-; X86-SSE2-NEXT: movaps %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
-; X86-SSE2-NEXT: paddd %mm0, %mm0
-; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSSE3-LABEL: build_v2f32_u1:
-; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-16, %esp
-; X86-SSSE3-NEXT: subl $32, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; X86-SSSE3-NEXT: movaps %xmm0, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
-; X86-SSSE3-NEXT: retl
-;
-; X64-SSE2-LABEL: build_v2f32_u1:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,2,3]
-; X64-SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v2f32_u1:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X64-SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
-;
-; X64-AVX1-LABEL: build_v2f32_u1:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X64-AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v2f32_u1:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vbroadcastss %xmm1, %xmm0
-; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v2f32_u1:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vbroadcastss %xmm1, %xmm0
-; X64-AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X86-SSE-LABEL: build_v2f32_u1:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movdq2q %xmm0, %mm0
+; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
+; X86-SSE-NEXT: paddd %mm0, %mm0
+; X86-SSE-NEXT: movq %mm0, (%eax)
+; X86-SSE-NEXT: retl
+;
+; X64-LABEL: build_v2f32_u1:
+; X64: # %bb.0:
+; X64-NEXT: movdq2q %xmm1, %mm0
+; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
+; X64-NEXT: paddd %mm0, %mm0
+; X64-NEXT: movq %mm0, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float undef, i32 0
%2 = insertelement <2 x float> %1, float %a1, i32 1
%3 = bitcast <2 x float> %2 to x86_mmx
@@ -1680,59 +751,33 @@ define void @build_v2f32_u1(x86_mmx *%p0
define void @build_v2f32_z1(x86_mmx *%p0, float %a0, float %a1) nounwind {
; X86-MMX-LABEL: build_v2f32_z1:
; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: flds 16(%ebp)
-; X86-MMX-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: movl $0, (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
-; X86-MMX-NEXT: paddd %mm0, %mm0
-; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
+; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-MMX-NEXT: pxor %mm1, %mm1
+; X86-MMX-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-MMX-NEXT: paddd %mm1, %mm1
+; X86-MMX-NEXT: movq %mm1, (%eax)
; X86-MMX-NEXT: retl
;
; X86-SSE-LABEL: build_v2f32_z1:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %ebp
-; X86-SSE-NEXT: movl %esp, %ebp
-; X86-SSE-NEXT: andl $-16, %esp
-; X86-SSE-NEXT: subl $32, %esp
-; X86-SSE-NEXT: movl 8(%ebp), %eax
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: xorps %xmm1, %xmm1
-; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
-; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
-; X86-SSE-NEXT: movaps %xmm0, (%esp)
-; X86-SSE-NEXT: movq (%esp), %mm0
-; X86-SSE-NEXT: paddd %mm0, %mm0
-; X86-SSE-NEXT: movq %mm0, (%eax)
-; X86-SSE-NEXT: movl %ebp, %esp
-; X86-SSE-NEXT: popl %ebp
+; X86-SSE-NEXT: movdq2q %xmm0, %mm0
+; X86-SSE-NEXT: pxor %mm1, %mm1
+; X86-SSE-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X86-SSE-NEXT: paddd %mm1, %mm1
+; X86-SSE-NEXT: movq %mm1, (%eax)
; X86-SSE-NEXT: retl
;
-; X64-SSE-LABEL: build_v2f32_z1:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: xorps %xmm0, %xmm0
-; X64-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; X64-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE-NEXT: paddd %mm0, %mm0
-; X64-SSE-NEXT: movq %mm0, (%rdi)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: build_v2f32_z1:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],zero,zero
-; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX-NEXT: paddd %mm0, %mm0
-; X64-AVX-NEXT: movq %mm0, (%rdi)
-; X64-AVX-NEXT: retq
+; X64-LABEL: build_v2f32_z1:
+; X64: # %bb.0:
+; X64-NEXT: movdq2q %xmm1, %mm0
+; X64-NEXT: pxor %mm1, %mm1
+; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0]
+; X64-NEXT: paddd %mm1, %mm1
+; X64-NEXT: movq %mm1, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float 0.0, i32 0
%2 = insertelement <2 x float> %1, float %a1, i32 1
%3 = bitcast <2 x float> %2 to x86_mmx
@@ -1744,99 +789,30 @@ define void @build_v2f32_z1(x86_mmx *%p0
define void @build_v2f32_00(x86_mmx *%p0, float %a0, float %a1) nounwind {
; X86-MMX-LABEL: build_v2f32_00:
; X86-MMX: # %bb.0:
-; X86-MMX-NEXT: pushl %ebp
-; X86-MMX-NEXT: movl %esp, %ebp
-; X86-MMX-NEXT: andl $-8, %esp
-; X86-MMX-NEXT: subl $8, %esp
-; X86-MMX-NEXT: movl 8(%ebp), %eax
-; X86-MMX-NEXT: flds 12(%ebp)
-; X86-MMX-NEXT: fsts {{[0-9]+}}(%esp)
-; X86-MMX-NEXT: fstps (%esp)
-; X86-MMX-NEXT: movq (%esp), %mm0
+; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0
+; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0]
; X86-MMX-NEXT: paddd %mm0, %mm0
; X86-MMX-NEXT: movq %mm0, (%eax)
-; X86-MMX-NEXT: movl %ebp, %esp
-; X86-MMX-NEXT: popl %ebp
; X86-MMX-NEXT: retl
;
-; X86-SSE2-LABEL: build_v2f32_00:
-; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: pushl %ebp
-; X86-SSE2-NEXT: movl %esp, %ebp
-; X86-SSE2-NEXT: andl $-16, %esp
-; X86-SSE2-NEXT: subl $32, %esp
-; X86-SSE2-NEXT: movl 8(%ebp), %eax
-; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
-; X86-SSE2-NEXT: movaps %xmm0, (%esp)
-; X86-SSE2-NEXT: movq (%esp), %mm0
-; X86-SSE2-NEXT: paddd %mm0, %mm0
-; X86-SSE2-NEXT: movq %mm0, (%eax)
-; X86-SSE2-NEXT: movl %ebp, %esp
-; X86-SSE2-NEXT: popl %ebp
-; X86-SSE2-NEXT: retl
-;
-; X86-SSSE3-LABEL: build_v2f32_00:
-; X86-SSSE3: # %bb.0:
-; X86-SSSE3-NEXT: pushl %ebp
-; X86-SSSE3-NEXT: movl %esp, %ebp
-; X86-SSSE3-NEXT: andl $-16, %esp
-; X86-SSSE3-NEXT: subl $32, %esp
-; X86-SSSE3-NEXT: movl 8(%ebp), %eax
-; X86-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; X86-SSSE3-NEXT: movaps %xmm0, (%esp)
-; X86-SSSE3-NEXT: movq (%esp), %mm0
-; X86-SSSE3-NEXT: paddd %mm0, %mm0
-; X86-SSSE3-NEXT: movq %mm0, (%eax)
-; X86-SSSE3-NEXT: movl %ebp, %esp
-; X86-SSSE3-NEXT: popl %ebp
-; X86-SSSE3-NEXT: retl
-;
-; X64-SSE2-LABEL: build_v2f32_00:
-; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,3]
-; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSE2-NEXT: paddd %mm0, %mm0
-; X64-SSE2-NEXT: movq %mm0, (%rdi)
-; X64-SSE2-NEXT: retq
-;
-; X64-SSSE3-LABEL: build_v2f32_00:
-; X64-SSSE3: # %bb.0:
-; X64-SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; X64-SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSSE3-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-SSSE3-NEXT: paddd %mm0, %mm0
-; X64-SSSE3-NEXT: movq %mm0, (%rdi)
-; X64-SSSE3-NEXT: retq
-;
-; X64-AVX1-LABEL: build_v2f32_00:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; X64-AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX1-NEXT: paddd %mm0, %mm0
-; X64-AVX1-NEXT: movq %mm0, (%rdi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: build_v2f32_00:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vbroadcastss %xmm0, %xmm0
-; X64-AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX2-NEXT: paddd %mm0, %mm0
-; X64-AVX2-NEXT: movq %mm0, (%rdi)
-; X64-AVX2-NEXT: retq
-;
-; X64-AVX512-LABEL: build_v2f32_00:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vbroadcastss %xmm0, %xmm0
-; X64-AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
-; X64-AVX512-NEXT: paddd %mm0, %mm0
-; X64-AVX512-NEXT: movq %mm0, (%rdi)
-; X64-AVX512-NEXT: retq
+; X86-SSE-LABEL: build_v2f32_00:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movdq2q %xmm0, %mm0
+; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
+; X86-SSE-NEXT: paddd %mm0, %mm0
+; X86-SSE-NEXT: movq %mm0, (%eax)
+; X86-SSE-NEXT: retl
+;
+; X64-LABEL: build_v2f32_00:
+; X64: # %bb.0:
+; X64-NEXT: movdq2q %xmm0, %mm0
+; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
+; X64-NEXT: paddd %mm0, %mm0
+; X64-NEXT: movq %mm0, (%rdi)
+; X64-NEXT: retq
%1 = insertelement <2 x float> undef, float %a0, i32 0
%2 = insertelement <2 x float> %1, float %a0, i32 1
%3 = bitcast <2 x float> %2 to x86_mmx
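
All of the mmx-build-vector.ll diffs above follow the same shape: rather than assembling the vector in GPR/SSE registers and round-tripping it through a stack slot, each element is moved straight into an MMX register (movd for integer values, movdq2q for SSE floats) and the vector is built with a tree of punpcklbw/punpcklwd/punpckldq, with pshufw covering the splat cases. A minimal reduced case in the same style as these tests is sketched below (reconstructed here for illustration - the exact bodies in the committed test file may differ):

declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)

; Build <2 x i32> from two scalars; with this patch the expectation is
; two movd transfers plus a single punpckldq, with no stack spill/reload.
define void @build_v2i32_01(x86_mmx *%p0, i32 %a0, i32 %a1) nounwind {
  %1 = insertelement <2 x i32> undef, i32 %a0, i32 0
  %2 = insertelement <2 x i32> %1, i32 %a1, i32 1
  %3 = bitcast <2 x i32> %2 to x86_mmx
  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
  store x86_mmx %4, x86_mmx *%p0
  ret void
}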
Modified: llvm/trunk/test/CodeGen/X86/pr29222.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr29222.ll?rev=327247&r1=327246&r2=327247&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr29222.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr29222.ll Sun Mar 11 12:22:13 2018
@@ -10,11 +10,9 @@ define i32 @PR29222(i32) nounwind {
; X86-SSE-NEXT: pushl %ebp
; X86-SSE-NEXT: movl %esp, %ebp
; X86-SSE-NEXT: andl $-8, %esp
-; X86-SSE-NEXT: subl $16, %esp
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X86-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; X86-SSE-NEXT: subl $8, %esp
+; X86-SSE-NEXT: movd 8(%ebp), %mm0
+; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
; X86-SSE-NEXT: packsswb %mm0, %mm0
; X86-SSE-NEXT: movq %mm0, (%esp)
; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
@@ -29,10 +27,9 @@ define i32 @PR29222(i32) nounwind {
; X86-AVX-NEXT: pushl %ebp
; X86-AVX-NEXT: movl %esp, %ebp
; X86-AVX-NEXT: andl $-8, %esp
-; X86-AVX-NEXT: subl $16, %esp
-; X86-AVX-NEXT: vbroadcastss 8(%ebp), %xmm0
-; X86-AVX-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movq {{[0-9]+}}(%esp), %mm0
+; X86-AVX-NEXT: subl $8, %esp
+; X86-AVX-NEXT: movd 8(%ebp), %mm0
+; X86-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
; X86-AVX-NEXT: packsswb %mm0, %mm0
; X86-AVX-NEXT: movq %mm0, (%esp)
; X86-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
@@ -44,10 +41,8 @@ define i32 @PR29222(i32) nounwind {
;
; X64-SSE-LABEL: PR29222:
; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movd %edi, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X64-SSE-NEXT: movq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
+; X64-SSE-NEXT: movd %edi, %mm0
+; X64-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
; X64-SSE-NEXT: packsswb %mm0, %mm0
; X64-SSE-NEXT: movq2dq %mm0, %xmm0
; X64-SSE-NEXT: packsswb %xmm0, %xmm0
@@ -56,10 +51,8 @@ define i32 @PR29222(i32) nounwind {
;
; X64-AVX-LABEL: PR29222:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovd %edi, %xmm0
-; X64-AVX-NEXT: vpbroadcastd %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %mm0
+; X64-AVX-NEXT: movd %edi, %mm0
+; X64-AVX-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1]
; X64-AVX-NEXT: packsswb %mm0, %mm0
; X64-AVX-NEXT: movq2dq %mm0, %xmm0
; X64-AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
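
The PR29222 diffs show the splat path in isolation: the i32 argument now goes directly into an MMX register with movd and is duplicated with pshufw $68. The immediate is easy to decode by hand - PSHUFW takes two selector bits per destination word, word 0 in the low bits:

  68 = 0x44 = 0b 01 00 01 00
                 w3 w2 w1 w0   =>  result = mm0[0,1,0,1]

That is the low 32-bit value repeated into both halves, matching the 0x44 ShufMask that createMMXBuildVector uses for the two-or-fewer element splat case.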
Modified: llvm/trunk/test/CodeGen/X86/vec_insert-mmx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_insert-mmx.ll?rev=327247&r1=327246&r2=327247&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_insert-mmx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_insert-mmx.ll Sun Mar 11 12:22:13 2018
@@ -6,12 +6,9 @@
define x86_mmx @t0(i32 %A) nounwind {
; X32-LABEL: t0:
; X32: ## %bb.0:
-; X32-NEXT: subl $12, %esp
-; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
-; X32-NEXT: movq %xmm0, (%esp)
-; X32-NEXT: movq (%esp), %mm0
-; X32-NEXT: addl $12, %esp
+; X32-NEXT: movd {{[0-9]+}}(%esp), %mm1
+; X32-NEXT: pxor %mm0, %mm0
+; X32-NEXT: punpckldq %mm1, %mm0 ## mm0 = mm0[0],mm1[0]
; X32-NEXT: retl
;
; X64-LABEL: t0:
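
The vec_insert-mmx.ll t0 change covers inserting a scalar into the upper element: element 0 is known zero, so the X32 path now zeroes one MMX register with pxor and interleaves the movd'd value into the high dword with punpckldq, again with no stack traffic. A hypothetical IR shape for this test (reconstructed; the actual body in the file may differ) is:

define x86_mmx @t0(i32 %A) nounwind {
  ; <0, %A>: element 0 stays zero, %A lands in element 1 (the high dword)
  %tmp = insertelement <2 x i32> zeroinitializer, i32 %A, i32 1
  %ret = bitcast <2 x i32> %tmp to x86_mmx
  ret x86_mmx %ret
}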