[llvm] r293438 - [X86][SSE] Lower scalar_to_vector(0) to zero vector
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 29 10:13:38 PST 2017
Author: rksimon
Date: Sun Jan 29 12:13:37 2017
New Revision: 293438
URL: http://llvm.org/viewvc/llvm-project?rev=293438&view=rev
Log:
[X86][SSE] Lower scalar_to_vector(0) to zero vector
Replaces an xor+movd/movq with an xorps, which is shorter in code size, avoids an int-fpu transfer, allows modern cores to fast-path the result during decode, and helps other combines recognise an all-zero vector.
The only reason I can think of that we'd want to keep scalar_to_vector in this case is to help recognise that the upper elements are undef, but this doesn't seem to be a problem.
Differential Revision: https://reviews.llvm.org/D29097
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
llvm/trunk/test/CodeGen/X86/insertelement-zero.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-128.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=293438&r1=293437&r2=293438&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Jan 29 12:13:37 2017
@@ -782,6 +782,7 @@ X86TargetLowering::X86TargetLowering(con
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
@@ -13973,15 +13974,21 @@ SDValue X86TargetLowering::LowerINSERT_V
return SDValue();
}
-static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
SDLoc dl(Op);
MVT OpVT = Op.getSimpleValueType();
+ // It's always cheaper to replace a xor+movd with xorps and simplifies further
+ // combines.
+ if (X86::isZeroNode(Op.getOperand(0)))
+ return getZeroVector(OpVT, Subtarget, DAG, dl);
+
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
if (!OpVT.is128BitVector()) {
// Insert into a 128-bit vector.
- unsigned SizeFactor = OpVT.getSizeInBits()/128;
+ unsigned SizeFactor = OpVT.getSizeInBits() / 128;
MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
OpVT.getVectorNumElements() / SizeFactor);
@@ -13990,9 +13997,13 @@ static SDValue LowerSCALAR_TO_VECTOR(SDV
// Insert the 128-bit vector.
return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
}
+ assert(OpVT.is128BitVector() && "Expected an SSE type!");
+
+ // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
+ if (OpVT == MVT::v4i32)
+ return Op;
SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
- assert(OpVT.is128BitVector() && "Expected an SSE type!");
return DAG.getBitcast(
OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
@@ -23342,7 +23353,7 @@ SDValue X86TargetLowering::LowerOperatio
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
- case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
+ case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
Modified: llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll?rev=293438&r1=293437&r2=293438&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll (original)
+++ llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll Sun Jan 29 12:13:37 2017
@@ -301,9 +301,8 @@ define <16 x i8> @_clearupper16xi8a(<16
define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
; SSE-LABEL: _clearupper2xi64b:
; SSE: # BB#0:
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
@@ -356,32 +355,7 @@ define <4 x i32> @_clearupper4xi32b(<4 x
define <8 x i16> @_clearupper8xi16b(<8 x i16>) nounwind {
; SSE-LABEL: _clearupper8xi16b:
; SSE: # BB#0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psllw $8, %xmm3
-; SSE-NEXT: pandn %xmm3, %xmm1
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: pslld $24, %xmm3
-; SSE-NEXT: pandn %xmm3, %xmm1
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm3
-; SSE-NEXT: psllq $40, %xmm3
-; SSE-NEXT: pandn %xmm3, %xmm1
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: psllq $56, %xmm2
-; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: _clearupper8xi16b:
Modified: llvm/trunk/test/CodeGen/X86/insertelement-zero.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/insertelement-zero.ll?rev=293438&r1=293437&r2=293438&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/insertelement-zero.ll (original)
+++ llvm/trunk/test/CodeGen/X86/insertelement-zero.ll Sun Jan 29 12:13:37 2017
@@ -244,24 +244,21 @@ define <8 x float> @insert_v8f32_z12345z
define <4 x i32> @insert_v4i32_01z3(<4 x i32> %a) {
; SSE2-LABEL: insert_v4i32_01z3:
; SSE2: # BB#0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: xorps %xmm1, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v4i32_01z3:
; SSE3: # BB#0:
-; SSE3-NEXT: xorl %eax, %eax
-; SSE3-NEXT: movd %eax, %xmm1
+; SSE3-NEXT: xorps %xmm1, %xmm1
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v4i32_01z3:
; SSSE3: # BB#0:
-; SSSE3-NEXT: xorl %eax, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT: retq
@@ -292,8 +289,7 @@ define <8 x i32> @insert_v8i32_z12345z7(
; SSE2: # BB#0:
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; SSE2-NEXT: retq
@@ -302,8 +298,7 @@ define <8 x i32> @insert_v8i32_z12345z7(
; SSE3: # BB#0:
; SSE3-NEXT: xorps %xmm2, %xmm2
; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSE3-NEXT: xorl %eax, %eax
-; SSE3-NEXT: movd %eax, %xmm2
+; SSE3-NEXT: xorps %xmm2, %xmm2
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; SSE3-NEXT: retq
@@ -312,8 +307,7 @@ define <8 x i32> @insert_v8i32_z12345z7(
; SSSE3: # BB#0:
; SSSE3-NEXT: xorps %xmm2, %xmm2
; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
-; SSSE3-NEXT: xorl %eax, %eax
-; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: xorps %xmm2, %xmm2
; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
; SSSE3-NEXT: retq
@@ -443,24 +437,12 @@ define <16 x i16> @insert_v16i16_z12345z
define <16 x i8> @insert_v16i8_z123456789ABCDEz(<16 x i8> %a) {
; SSE2-LABEL: insert_v16i8_z123456789ABCDEz:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v16i8_z123456789ABCDEz:
; SSE3: # BB#0:
-; SSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT: pand %xmm1, %xmm0
-; SSE3-NEXT: xorl %eax, %eax
-; SSE3-NEXT: movd %eax, %xmm2
-; SSE3-NEXT: pandn %xmm2, %xmm1
-; SSE3-NEXT: por %xmm1, %xmm0
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: insert_v16i8_z123456789ABCDEz:
@@ -489,25 +471,13 @@ define <16 x i8> @insert_v16i8_z12345678
define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
; SSE2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
; SSE2-NEXT: andps {{.*}}(%rip), %xmm1
; SSE2-NEXT: retq
;
; SSE3-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
; SSE3: # BB#0:
-; SSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SSE3-NEXT: pand %xmm2, %xmm0
-; SSE3-NEXT: xorl %eax, %eax
-; SSE3-NEXT: movd %eax, %xmm3
-; SSE3-NEXT: pandn %xmm3, %xmm2
-; SSE3-NEXT: por %xmm2, %xmm0
-; SSE3-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
; SSE3-NEXT: andps {{.*}}(%rip), %xmm1
; SSE3-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-128.ll?rev=293438&r1=293437&r2=293438&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-variable-128.ll Sun Jan 29 12:13:37 2017
@@ -1329,28 +1329,27 @@ define <8 x i16> @var_shuffle_v8i16_v8i1
; SSE2-NEXT: andl $7, %ecx
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: andl $7, %r8d
-; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: andl $7, %r9d
; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax
-; SSE2-NEXT: xorl %esi, %esi
-; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax
; SSE2-NEXT: movzwl -40(%rsp,%rdx,2), %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movd %ecx, %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: movzwl -40(%rsp,%r8,2), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT: retq
;
@@ -1368,28 +1367,27 @@ define <8 x i16> @var_shuffle_v8i16_v8i1
; SSSE3-NEXT: andl $7, %ecx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: andl $7, %r8d
-; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: andl $7, %r9d
; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
-; SSSE3-NEXT: xorl %esi, %esi
-; SSSE3-NEXT: movd %esi, %xmm0
; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movd %ecx, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
; SSSE3-NEXT: movd %eax, %xmm3
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax
; SSSE3-NEXT: movzwl -40(%rsp,%rdx,2), %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movd %ecx, %xmm3
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: movzwl -40(%rsp,%r8,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT: movd %eax, %xmm1
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: retq
;
More information about the llvm-commits
mailing list