[llvm] r323541 - [X86][SSE] Don't coalesce v4i32 extracts
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 26 09:11:34 PST 2018
Author: rksimon
Date: Fri Jan 26 09:11:34 2018
New Revision: 323541
URL: http://llvm.org/viewvc/llvm-project?rev=323541&view=rev
Log:
[X86][SSE] Don't coalesce v4i32 extracts
We currently coalesce v4i32 extracts from all 4 elements to 2 v2i64 extracts + shifts/sign-extends.
This seems to have been added back in the days when we tended to spill vectors and reload scalars, or ended up with repeated shuffles moving everything down to the 0'th index. I don't think either of these is likely these days, as we have better EXTRACT_VECTOR_ELT and VECTOR_SHUFFLE handling, and the existing code tends to make things very difficult for various vector and load combines.
Differential Revision: https://reviews.llvm.org/D42308
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/gather-addresses.ll
llvm/trunk/test/CodeGen/X86/mulvi32.ll
llvm/trunk/test/CodeGen/X86/pr18344.ll
llvm/trunk/test/CodeGen/X86/pr21792.ll
llvm/trunk/test/CodeGen/X86/var-permute-128.ll
llvm/trunk/test/CodeGen/X86/var-permute-256.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=323541&r1=323540&r2=323541&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Jan 26 09:11:34 2018
@@ -31239,102 +31239,7 @@ static SDValue combineExtractVectorElt(S
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
return MinMax;
- // Only operate on vectors of 4 elements, where the alternative shuffling
- // gets to be more expensive.
- if (SrcVT != MVT::v4i32)
- return SDValue();
-
- // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
- // single use which is a sign-extend or zero-extend, and all elements are
- // used.
- SmallVector<SDNode *, 4> Uses;
- unsigned ExtractedElements = 0;
- for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
- UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
- if (UI.getUse().getResNo() != InputVector.getResNo())
- return SDValue();
-
- SDNode *Extract = *UI;
- if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
-
- if (Extract->getValueType(0) != MVT::i32)
- return SDValue();
- if (!Extract->hasOneUse())
- return SDValue();
- if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
- Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
- return SDValue();
- if (!isa<ConstantSDNode>(Extract->getOperand(1)))
- return SDValue();
-
- // Record which element was extracted.
- ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
- Uses.push_back(Extract);
- }
-
- // If not all the elements were used, this may not be worthwhile.
- if (ExtractedElements != 15)
- return SDValue();
-
- // Ok, we've now decided to do the transformation.
- // If 64-bit shifts are legal, use the extract-shift sequence,
- // otherwise bounce the vector off the cache.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue Vals[4];
-
- if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
- SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
- auto &DL = DAG.getDataLayout();
- EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
- SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
- DAG.getConstant(0, dl, VecIdxTy));
- SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
- DAG.getConstant(1, dl, VecIdxTy));
-
- SDValue ShAmt = DAG.getConstant(
- 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
- Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
- Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
- DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
- Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
- Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
- DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
- } else {
- // Store the value to a temporary stack slot.
- SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
- SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
- MachinePointerInfo());
-
- EVT ElementType = SrcVT.getVectorElementType();
- unsigned EltSize = ElementType.getSizeInBits() / 8;
-
- // Replace each use (extract) with a load of the appropriate element.
- for (unsigned i = 0; i < 4; ++i) {
- uint64_t Offset = EltSize * i;
- auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
- SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
-
- SDValue ScalarAddr =
- DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
-
- // Load the scalar.
- Vals[i] =
- DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
- }
- }
-
- // Replace the extracts
- for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
- UE = Uses.end(); UI != UE; ++UI) {
- SDNode *Extract = *UI;
-
- uint64_t IdxVal = Extract->getConstantOperandVal(1);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
- }
-
- // The replacement was made in place; return N so it won't be revisited.
- return SDValue(N, 0);
+ return SDValue();
}
/// If a vector select has an operand that is -1 or 0, try to simplify the
Modified: llvm/trunk/test/CodeGen/X86/gather-addresses.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/gather-addresses.ll?rev=323541&r1=323540&r2=323541&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/gather-addresses.ll (original)
+++ llvm/trunk/test/CodeGen/X86/gather-addresses.ll Fri Jan 26 09:11:34 2018
@@ -7,21 +7,24 @@
; rdar://7398554
; When doing vector gather-scatter index calculation with 32-bit indices,
-; use an efficient mov/shift sequence rather than shuffling each individual
-; element out of the index vector.
+; minimize shuffling of each individual element out of the index vector.
define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
; LIN-SSE2-LABEL: foo:
; LIN-SSE2: # %bb.0:
; LIN-SSE2-NEXT: movdqa (%rsi), %xmm0
; LIN-SSE2-NEXT: pand (%rdx), %xmm0
+; LIN-SSE2-NEXT: movd %xmm0, %eax
+; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; LIN-SSE2-NEXT: movd %xmm1, %ecx
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; LIN-SSE2-NEXT: movq %xmm1, %rax
-; LIN-SSE2-NEXT: movq %xmm0, %rcx
-; LIN-SSE2-NEXT: movslq %ecx, %rdx
-; LIN-SSE2-NEXT: sarq $32, %rcx
-; LIN-SSE2-NEXT: movslq %eax, %rsi
-; LIN-SSE2-NEXT: sarq $32, %rax
+; LIN-SSE2-NEXT: movd %xmm1, %edx
+; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; LIN-SSE2-NEXT: movd %xmm0, %esi
+; LIN-SSE2-NEXT: cltq
+; LIN-SSE2-NEXT: movslq %ecx, %rcx
+; LIN-SSE2-NEXT: movslq %edx, %rdx
+; LIN-SSE2-NEXT: movslq %esi, %rsi
; LIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; LIN-SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; LIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
@@ -32,14 +35,16 @@ define <4 x double> @foo(double* %p, <4
; LIN-SSE4: # %bb.0:
; LIN-SSE4-NEXT: movdqa (%rsi), %xmm0
; LIN-SSE4-NEXT: pand (%rdx), %xmm0
-; LIN-SSE4-NEXT: pextrq $1, %xmm0, %rax
-; LIN-SSE4-NEXT: movq %xmm0, %rcx
-; LIN-SSE4-NEXT: movslq %ecx, %rdx
-; LIN-SSE4-NEXT: sarq $32, %rcx
-; LIN-SSE4-NEXT: movslq %eax, %rsi
+; LIN-SSE4-NEXT: movd %xmm0, %eax
+; LIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx
+; LIN-SSE4-NEXT: pextrd $2, %xmm0, %edx
+; LIN-SSE4-NEXT: pextrd $3, %xmm0, %esi
+; LIN-SSE4-NEXT: cltq
+; LIN-SSE4-NEXT: movslq %ecx, %rcx
+; LIN-SSE4-NEXT: movslq %edx, %rdx
; LIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; LIN-SSE4-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; LIN-SSE4-NEXT: sarq $32, %rax
+; LIN-SSE4-NEXT: movslq %esi, %rax
; LIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; LIN-SSE4-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; LIN-SSE4-NEXT: retq
@@ -48,13 +53,17 @@ define <4 x double> @foo(double* %p, <4
; WIN-SSE2: # %bb.0:
; WIN-SSE2-NEXT: movdqa (%rdx), %xmm0
; WIN-SSE2-NEXT: pand (%r8), %xmm0
+; WIN-SSE2-NEXT: movd %xmm0, %r8d
+; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; WIN-SSE2-NEXT: movd %xmm1, %r9d
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; WIN-SSE2-NEXT: movq %xmm1, %rax
-; WIN-SSE2-NEXT: movq %xmm0, %rdx
-; WIN-SSE2-NEXT: movslq %edx, %r8
-; WIN-SSE2-NEXT: sarq $32, %rdx
-; WIN-SSE2-NEXT: movslq %eax, %r9
-; WIN-SSE2-NEXT: sarq $32, %rax
+; WIN-SSE2-NEXT: movd %xmm1, %r10d
+; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; WIN-SSE2-NEXT: movd %xmm0, %edx
+; WIN-SSE2-NEXT: movslq %r8d, %rax
+; WIN-SSE2-NEXT: movslq %r9d, %r8
+; WIN-SSE2-NEXT: movslq %r10d, %r9
+; WIN-SSE2-NEXT: movslq %edx, %rdx
; WIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; WIN-SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; WIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
@@ -65,14 +74,16 @@ define <4 x double> @foo(double* %p, <4
; WIN-SSE4: # %bb.0:
; WIN-SSE4-NEXT: movdqa (%rdx), %xmm0
; WIN-SSE4-NEXT: pand (%r8), %xmm0
-; WIN-SSE4-NEXT: pextrq $1, %xmm0, %rax
-; WIN-SSE4-NEXT: movq %xmm0, %rdx
-; WIN-SSE4-NEXT: movslq %edx, %r8
-; WIN-SSE4-NEXT: sarq $32, %rdx
-; WIN-SSE4-NEXT: movslq %eax, %r9
+; WIN-SSE4-NEXT: movd %xmm0, %eax
+; WIN-SSE4-NEXT: pextrd $1, %xmm0, %edx
+; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d
+; WIN-SSE4-NEXT: pextrd $3, %xmm0, %r9d
+; WIN-SSE4-NEXT: cltq
+; WIN-SSE4-NEXT: movslq %edx, %rdx
+; WIN-SSE4-NEXT: movslq %r8d, %r8
; WIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; WIN-SSE4-NEXT: sarq $32, %rax
+; WIN-SSE4-NEXT: movslq %r9d, %rax
; WIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
; WIN-SSE4-NEXT: retq
@@ -127,22 +138,22 @@ define <4 x i64> @old(double* %p, <4 x i
; LIN-SSE2: # %bb.0:
; LIN-SSE2-NEXT: movdqa (%rsi), %xmm0
; LIN-SSE2-NEXT: pand (%rdx), %xmm0
+; LIN-SSE2-NEXT: movd %xmm0, %eax
+; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; LIN-SSE2-NEXT: movd %xmm1, %edx
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; LIN-SSE2-NEXT: movq %xmm1, %rax
-; LIN-SSE2-NEXT: movq %rax, %rdx
-; LIN-SSE2-NEXT: shrq $32, %rdx
-; LIN-SSE2-NEXT: movq %xmm0, %rsi
-; LIN-SSE2-NEXT: movq %rsi, %rdi
-; LIN-SSE2-NEXT: shrq $32, %rdi
-; LIN-SSE2-NEXT: andl %ecx, %esi
-; LIN-SSE2-NEXT: andl %ecx, %eax
-; LIN-SSE2-NEXT: andq %rcx, %rdi
+; LIN-SSE2-NEXT: movd %xmm1, %esi
+; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; LIN-SSE2-NEXT: movd %xmm0, %edi
+; LIN-SSE2-NEXT: andq %rcx, %rax
; LIN-SSE2-NEXT: andq %rcx, %rdx
-; LIN-SSE2-NEXT: movq %rdi, %xmm1
-; LIN-SSE2-NEXT: movq %rsi, %xmm0
+; LIN-SSE2-NEXT: andq %rcx, %rsi
+; LIN-SSE2-NEXT: andq %rcx, %rdi
+; LIN-SSE2-NEXT: movq %rax, %xmm0
+; LIN-SSE2-NEXT: movq %rdx, %xmm1
; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; LIN-SSE2-NEXT: movq %rdx, %xmm2
-; LIN-SSE2-NEXT: movq %rax, %xmm1
+; LIN-SSE2-NEXT: movq %rdi, %xmm2
+; LIN-SSE2-NEXT: movq %rsi, %xmm1
; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; LIN-SSE2-NEXT: retq
;
@@ -150,21 +161,19 @@ define <4 x i64> @old(double* %p, <4 x i
; LIN-SSE4: # %bb.0:
; LIN-SSE4-NEXT: movdqa (%rsi), %xmm0
; LIN-SSE4-NEXT: pand (%rdx), %xmm0
-; LIN-SSE4-NEXT: pextrq $1, %xmm0, %rax
-; LIN-SSE4-NEXT: movq %rax, %rdx
-; LIN-SSE4-NEXT: shrq $32, %rdx
-; LIN-SSE4-NEXT: movq %xmm0, %rsi
-; LIN-SSE4-NEXT: movq %rsi, %rdi
-; LIN-SSE4-NEXT: shrq $32, %rdi
-; LIN-SSE4-NEXT: andl %ecx, %esi
-; LIN-SSE4-NEXT: andl %ecx, %eax
-; LIN-SSE4-NEXT: andq %rcx, %rdi
+; LIN-SSE4-NEXT: movd %xmm0, %eax
+; LIN-SSE4-NEXT: pextrd $1, %xmm0, %edx
+; LIN-SSE4-NEXT: pextrd $2, %xmm0, %esi
+; LIN-SSE4-NEXT: pextrd $3, %xmm0, %edi
+; LIN-SSE4-NEXT: andq %rcx, %rax
; LIN-SSE4-NEXT: andq %rcx, %rdx
-; LIN-SSE4-NEXT: movq %rdi, %xmm1
-; LIN-SSE4-NEXT: movq %rsi, %xmm0
+; LIN-SSE4-NEXT: andq %rcx, %rsi
+; LIN-SSE4-NEXT: andq %rcx, %rdi
+; LIN-SSE4-NEXT: movq %rdx, %xmm1
+; LIN-SSE4-NEXT: movq %rax, %xmm0
; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; LIN-SSE4-NEXT: movq %rdx, %xmm2
-; LIN-SSE4-NEXT: movq %rax, %xmm1
+; LIN-SSE4-NEXT: movq %rdi, %xmm2
+; LIN-SSE4-NEXT: movq %rsi, %xmm1
; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; LIN-SSE4-NEXT: retq
;
@@ -172,21 +181,21 @@ define <4 x i64> @old(double* %p, <4 x i
; WIN-SSE2: # %bb.0:
; WIN-SSE2-NEXT: movdqa (%rdx), %xmm0
; WIN-SSE2-NEXT: pand (%r8), %xmm0
+; WIN-SSE2-NEXT: movd %xmm0, %eax
+; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; WIN-SSE2-NEXT: movd %xmm1, %ecx
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; WIN-SSE2-NEXT: movq %xmm1, %r8
-; WIN-SSE2-NEXT: movq %r8, %rcx
-; WIN-SSE2-NEXT: shrq $32, %rcx
-; WIN-SSE2-NEXT: movq %xmm0, %rax
-; WIN-SSE2-NEXT: movq %rax, %rdx
-; WIN-SSE2-NEXT: shrq $32, %rdx
-; WIN-SSE2-NEXT: andl %r9d, %eax
-; WIN-SSE2-NEXT: andl %r9d, %r8d
-; WIN-SSE2-NEXT: andq %r9, %rdx
+; WIN-SSE2-NEXT: movd %xmm1, %r8d
+; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; WIN-SSE2-NEXT: movd %xmm0, %edx
+; WIN-SSE2-NEXT: andq %r9, %rax
; WIN-SSE2-NEXT: andq %r9, %rcx
-; WIN-SSE2-NEXT: movq %rdx, %xmm1
+; WIN-SSE2-NEXT: andq %r9, %r8
+; WIN-SSE2-NEXT: andq %r9, %rdx
; WIN-SSE2-NEXT: movq %rax, %xmm0
+; WIN-SSE2-NEXT: movq %rcx, %xmm1
; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; WIN-SSE2-NEXT: movq %rcx, %xmm2
+; WIN-SSE2-NEXT: movq %rdx, %xmm2
; WIN-SSE2-NEXT: movq %r8, %xmm1
; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; WIN-SSE2-NEXT: retq
@@ -195,53 +204,47 @@ define <4 x i64> @old(double* %p, <4 x i
; WIN-SSE4: # %bb.0:
; WIN-SSE4-NEXT: movdqa (%rdx), %xmm0
; WIN-SSE4-NEXT: pand (%r8), %xmm0
-; WIN-SSE4-NEXT: pextrq $1, %xmm0, %r8
-; WIN-SSE4-NEXT: movq %r8, %rcx
-; WIN-SSE4-NEXT: shrq $32, %rcx
-; WIN-SSE4-NEXT: movq %xmm0, %rax
-; WIN-SSE4-NEXT: movq %rax, %rdx
-; WIN-SSE4-NEXT: shrq $32, %rdx
-; WIN-SSE4-NEXT: andl %r9d, %eax
-; WIN-SSE4-NEXT: andl %r9d, %r8d
-; WIN-SSE4-NEXT: andq %r9, %rdx
+; WIN-SSE4-NEXT: movd %xmm0, %eax
+; WIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx
+; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d
+; WIN-SSE4-NEXT: pextrd $3, %xmm0, %edx
+; WIN-SSE4-NEXT: andq %r9, %rax
; WIN-SSE4-NEXT: andq %r9, %rcx
-; WIN-SSE4-NEXT: movq %rdx, %xmm1
+; WIN-SSE4-NEXT: andq %r9, %r8
+; WIN-SSE4-NEXT: andq %r9, %rdx
+; WIN-SSE4-NEXT: movq %rcx, %xmm1
; WIN-SSE4-NEXT: movq %rax, %xmm0
; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; WIN-SSE4-NEXT: movq %rcx, %xmm2
+; WIN-SSE4-NEXT: movq %rdx, %xmm2
; WIN-SSE4-NEXT: movq %r8, %xmm1
; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; WIN-SSE4-NEXT: retq
;
; LIN32-LABEL: old:
; LIN32: # %bb.0:
-; LIN32-NEXT: pushl %ebp
-; LIN32-NEXT: movl %esp, %ebp
+; LIN32-NEXT: pushl %edi
; LIN32-NEXT: pushl %esi
-; LIN32-NEXT: andl $-16, %esp
-; LIN32-NEXT: subl $32, %esp
-; LIN32-NEXT: movl 20(%ebp), %eax
-; LIN32-NEXT: movl 16(%ebp), %ecx
-; LIN32-NEXT: movl 12(%ebp), %edx
-; LIN32-NEXT: movaps (%edx), %xmm0
-; LIN32-NEXT: andps (%ecx), %xmm0
-; LIN32-NEXT: movaps %xmm0, (%esp)
-; LIN32-NEXT: movl (%esp), %ecx
-; LIN32-NEXT: andl %eax, %ecx
+; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; LIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; LIN32-NEXT: movdqa (%edx), %xmm0
+; LIN32-NEXT: pand (%ecx), %xmm0
+; LIN32-NEXT: movd %xmm0, %ecx
+; LIN32-NEXT: pextrd $1, %xmm0, %edx
+; LIN32-NEXT: pextrd $2, %xmm0, %esi
+; LIN32-NEXT: pextrd $3, %xmm0, %edi
+; LIN32-NEXT: andl %eax, %ecx
; LIN32-NEXT: andl %eax, %edx
-; LIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; LIN32-NEXT: andl %eax, %esi
-; LIN32-NEXT: andl {{[0-9]+}}(%esp), %eax
+; LIN32-NEXT: andl %eax, %edi
; LIN32-NEXT: movd %edx, %xmm1
; LIN32-NEXT: movd %ecx, %xmm0
; LIN32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; LIN32-NEXT: movd %eax, %xmm2
+; LIN32-NEXT: movd %edi, %xmm2
; LIN32-NEXT: movd %esi, %xmm1
; LIN32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; LIN32-NEXT: leal -4(%ebp), %esp
; LIN32-NEXT: popl %esi
-; LIN32-NEXT: popl %ebp
+; LIN32-NEXT: popl %edi
; LIN32-NEXT: retl
%a = load <4 x i32>, <4 x i32>* %i
%b = load <4 x i32>, <4 x i32>* %h
Modified: llvm/trunk/test/CodeGen/X86/mulvi32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/mulvi32.ll?rev=323541&r1=323540&r2=323541&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/mulvi32.ll (original)
+++ llvm/trunk/test/CodeGen/X86/mulvi32.ll Fri Jan 26 09:11:34 2018
@@ -153,109 +153,51 @@ define <4 x i32> @_mul4xi32b(<4 x i32>,
define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) {
; SSE2-LABEL: _mul4xi32toi64a:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rcx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: shrq $32, %rcx
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: movd %edx, %xmm2
-; SSE2-NEXT: shrq $32, %rdx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rsi
-; SSE2-NEXT: movd %esi, %xmm3
-; SSE2-NEXT: shrq $32, %rsi
-; SSE2-NEXT: movd %esi, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE2-NEXT: movd %edx, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: shrq $32, %rax
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm3
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE42-LABEL: _mul4xi32toi64a:
; SSE42: # %bb.0:
-; SSE42-NEXT: movq %xmm1, %rax
-; SSE42-NEXT: pextrq $1, %xmm1, %rcx
-; SSE42-NEXT: movd %ecx, %xmm1
-; SSE42-NEXT: shrq $32, %rcx
-; SSE42-NEXT: movq %xmm0, %rdx
-; SSE42-NEXT: movd %edx, %xmm2
-; SSE42-NEXT: shrq $32, %rdx
-; SSE42-NEXT: pextrq $1, %xmm0, %rsi
-; SSE42-NEXT: movd %esi, %xmm3
-; SSE42-NEXT: shrq $32, %rsi
-; SSE42-NEXT: movd %esi, %xmm0
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE42-NEXT: movd %edx, %xmm0
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE42-NEXT: movd %ecx, %xmm0
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE42-NEXT: movd %eax, %xmm0
-; SSE42-NEXT: shrq $32, %rax
-; SSE42-NEXT: pmuludq %xmm3, %xmm1
-; SSE42-NEXT: movd %eax, %xmm3
-; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE42-NEXT: pmuludq %xmm2, %xmm0
+; SSE42-NEXT: pxor %xmm3, %xmm3
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE42-NEXT: pmuludq %xmm0, %xmm1
+; SSE42-NEXT: pmuludq %xmm4, %xmm2
+; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: _mul4xi32toi64a:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: vmovq %xmm1, %rcx
-; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: shrq $32, %rdx
-; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
-; AVX1-NEXT: vmovd %esi, %xmm1
-; AVX1-NEXT: shrq $32, %rsi
-; AVX1-NEXT: vmovd %esi, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX1-NEXT: vmovd %edx, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX1-NEXT: vmovd %ecx, %xmm3
-; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %ecx, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
-; AVX1-NEXT: vmovd %eax, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: _mul4xi32toi64a:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: vmovd %eax, %xmm2
-; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
-; AVX2-NEXT: vmovq %xmm0, %rdx
-; AVX2-NEXT: vmovd %edx, %xmm1
-; AVX2-NEXT: shrq $32, %rdx
-; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX2-NEXT: vmovd %esi, %xmm0
-; AVX2-NEXT: shrq $32, %rsi
-; AVX2-NEXT: vmovd %esi, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX2-NEXT: vmovd %edx, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX2-NEXT: vmovd %ecx, %xmm3
-; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vmovd %ecx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
-; AVX2-NEXT: vmovd %eax, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%f00 = extractelement <4 x i32> %0, i32 0
Modified: llvm/trunk/test/CodeGen/X86/pr18344.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr18344.ll?rev=323541&r1=323540&r2=323541&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr18344.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr18344.ll Fri Jan 26 09:11:34 2018
@@ -36,12 +36,14 @@ define void @FFT(%v4_varying_complex* no
; X64: # %bb.0: # %begin
; X64-NEXT: movdqu (%rdx), %xmm0
; X64-NEXT: pslld $4, %xmm0
-; X64-NEXT: movq %xmm0, %rax
+; X64-NEXT: movd %xmm0, %eax
; X64-NEXT: movslq %eax, %r8
-; X64-NEXT: sarq $32, %rax
-; X64-NEXT: pextrq $1, %xmm0, %rdx
-; X64-NEXT: movslq %edx, %rcx
-; X64-NEXT: sarq $32, %rdx
+; X64-NEXT: pextrd $1, %xmm0, %ecx
+; X64-NEXT: movslq %ecx, %rcx
+; X64-NEXT: pextrd $2, %xmm0, %edx
+; X64-NEXT: movslq %edx, %rdx
+; X64-NEXT: pextrd $3, %xmm0, %eax
+; X64-NEXT: cltq
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
Modified: llvm/trunk/test/CodeGen/X86/pr21792.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr21792.ll?rev=323541&r1=323540&r2=323541&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr21792.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr21792.ll Fri Jan 26 09:11:34 2018
@@ -12,19 +12,16 @@ define void @func(<4 x float> %vx) {
; CHECK-NEXT: pushq %rax
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pextrq $1, %xmm0, %rax
-; CHECK-NEXT: movzwl %ax, %ecx
-; CHECK-NEXT: shrq $32, %rax
-; CHECK-NEXT: movq %xmm0, %rdx
-; CHECK-NEXT: movzwl %dx, %r8d
-; CHECK-NEXT: movq %rdx, %r9
-; CHECK-NEXT: shrq $32, %r9
+; CHECK-NEXT: movd %xmm0, %r8d
; CHECK-NEXT: leaq stuff(%r8), %rdi
-; CHECK-NEXT: leaq stuff(%r9), %rsi
-; CHECK-NEXT: leaq stuff(%rcx), %rdx
-; CHECK-NEXT: leaq stuff(%rax), %rcx
+; CHECK-NEXT: pextrd $1, %xmm0, %eax
+; CHECK-NEXT: leaq stuff(%rax), %rsi
+; CHECK-NEXT: pextrd $2, %xmm0, %edx
+; CHECK-NEXT: pextrd $3, %xmm0, %ecx
+; CHECK-NEXT: leaq stuff(%rdx), %rdx
+; CHECK-NEXT: leaq stuff(%rcx), %rcx
; CHECK-NEXT: leaq stuff+8(%r8), %r8
-; CHECK-NEXT: leaq stuff+8(%r9), %r9
+; CHECK-NEXT: leaq stuff+8(%rax), %r9
; CHECK-NEXT: callq toto
; CHECK-NEXT: popq %rax
; CHECK-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/var-permute-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/var-permute-128.ll?rev=323541&r1=323540&r2=323541&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/var-permute-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/var-permute-128.ll Fri Jan 26 09:11:34 2018
@@ -37,44 +37,42 @@ define <2 x i64> @var_shuffle_v2i64(<2 x
define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSSE3-LABEL: var_shuffle_v4i32:
; SSSE3: # %bb.0:
+; SSSE3-NEXT: movd %xmm1, %eax
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSSE3-NEXT: movd %xmm2, %ecx
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movq %xmm2, %rax
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: sarq $32, %rcx
-; SSSE3-NEXT: movq %xmm1, %rdx
-; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: sarq $32, %rsi
-; SSSE3-NEXT: andl $3, %edx
+; SSSE3-NEXT: movd %xmm2, %edx
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSSE3-NEXT: movd %xmm1, %esi
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: andl $3, %eax
; SSSE3-NEXT: andl $3, %ecx
+; SSSE3-NEXT: andl $3, %edx
+; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; AVX-LABEL: var_shuffle_v4i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrq $1, %xmm1, %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: sarq $32, %rcx
-; AVX-NEXT: vmovq %xmm1, %rdx
-; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: sarq $32, %rsi
-; AVX-NEXT: andl $3, %edx
+; AVX-NEXT: vmovd %xmm1, %eax
+; AVX-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX-NEXT: vpextrd $2, %xmm1, %edx
+; AVX-NEXT: vpextrd $3, %xmm1, %esi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: andl $3, %esi
; AVX-NEXT: andl $3, %eax
; AVX-NEXT: andl $3, %ecx
+; AVX-NEXT: andl $3, %edx
+; AVX-NEXT: andl $3, %esi
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
-; AVX-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0
-; AVX-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0
; AVX-NEXT: retq
%index0 = extractelement <4 x i32> %indices, i32 0
%index1 = extractelement <4 x i32> %indices, i32 1
@@ -287,40 +285,38 @@ define <2 x double> @var_shuffle_v2f64(<
define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
; SSSE3-LABEL: var_shuffle_v4f32:
; SSSE3: # %bb.0:
+; SSSE3-NEXT: movd %xmm1, %eax
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
+; SSSE3-NEXT: movd %xmm2, %ecx
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSSE3-NEXT: movq %xmm2, %rax
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: sarq $32, %rcx
-; SSSE3-NEXT: movq %xmm1, %rdx
-; SSSE3-NEXT: movq %rdx, %rsi
-; SSSE3-NEXT: sarq $32, %rsi
-; SSSE3-NEXT: andl $3, %edx
+; SSSE3-NEXT: movd %xmm2, %edx
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSSE3-NEXT: movd %xmm1, %esi
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: andl $3, %eax
; SSSE3-NEXT: andl $3, %ecx
+; SSSE3-NEXT: andl $3, %edx
+; SSSE3-NEXT: andl $3, %esi
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; AVX-LABEL: var_shuffle_v4f32:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrq $1, %xmm1, %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: sarq $32, %rcx
-; AVX-NEXT: vmovq %xmm1, %rdx
-; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: sarq $32, %rsi
-; AVX-NEXT: andl $3, %edx
+; AVX-NEXT: vmovd %xmm1, %eax
+; AVX-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX-NEXT: vpextrd $2, %xmm1, %edx
+; AVX-NEXT: vpextrd $3, %xmm1, %esi
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: andl $3, %esi
; AVX-NEXT: andl $3, %eax
; AVX-NEXT: andl $3, %ecx
+; AVX-NEXT: andl $3, %edx
+; AVX-NEXT: andl $3, %esi
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
Modified: llvm/trunk/test/CodeGen/X86/var-permute-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/var-permute-256.ll?rev=323541&r1=323540&r2=323541&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/var-permute-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/var-permute-256.ll Fri Jan 26 09:11:34 2018
@@ -119,36 +119,32 @@ define <8 x i32> @var_shuffle_v8i32(<8 x
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: vpextrq $1, %xmm1, %r8
-; AVX1-NEXT: movq %r8, %rcx
-; AVX1-NEXT: shrq $30, %rcx
-; AVX1-NEXT: vmovq %xmm1, %r9
-; AVX1-NEXT: movq %r9, %rsi
-; AVX1-NEXT: shrq $30, %rsi
+; AVX1-NEXT: vmovd %xmm1, %r8d
+; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
+; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
+; AVX1-NEXT: vpextrd $3, %xmm1, %esi
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpextrq $1, %xmm1, %r10
-; AVX1-NEXT: movq %r10, %rdi
-; AVX1-NEXT: shrq $30, %rdi
-; AVX1-NEXT: vmovq %xmm1, %rax
-; AVX1-NEXT: movq %rax, %rdx
-; AVX1-NEXT: shrq $30, %rdx
+; AVX1-NEXT: vmovd %xmm1, %edi
+; AVX1-NEXT: vpextrd $1, %xmm1, %eax
+; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
+; AVX1-NEXT: vpextrd $3, %xmm1, %edx
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; AVX1-NEXT: andl $7, %r9d
-; AVX1-NEXT: andl $28, %esi
; AVX1-NEXT: andl $7, %r8d
-; AVX1-NEXT: andl $28, %ecx
-; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: andl $28, %edx
+; AVX1-NEXT: andl $7, %r9d
; AVX1-NEXT: andl $7, %r10d
-; AVX1-NEXT: andl $28, %edi
+; AVX1-NEXT: andl $7, %esi
+; AVX1-NEXT: andl $7, %edi
+; AVX1-NEXT: andl $7, %eax
+; AVX1-NEXT: andl $7, %ecx
+; AVX1-NEXT: andl $7, %edx
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpinsrd $1, (%rsp,%rdx), %xmm0, %xmm0
-; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0
-; AVX1-NEXT: vpinsrd $3, (%rsp,%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $1, (%rsp,%rax,4), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $2, (%rsp,%rcx,4), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $3, (%rsp,%rdx,4), %xmm0, %xmm0
; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpinsrd $1, (%rsp,%rsi), %xmm1, %xmm1
-; AVX1-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1
-; AVX1-NEXT: vpinsrd $3, (%rsp,%rcx), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $1, (%rsp,%r9,4), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: movq %rbp, %rsp
; AVX1-NEXT: popq %rbp
@@ -1212,28 +1208,24 @@ define <8 x float> @var_shuffle_v8f32(<8
; AVX1-NEXT: movq %rsp, %rbp
; AVX1-NEXT: andq $-32, %rsp
; AVX1-NEXT: subq $64, %rsp
-; AVX1-NEXT: vpextrq $1, %xmm1, %r8
-; AVX1-NEXT: movq %r8, %rcx
-; AVX1-NEXT: shrq $30, %rcx
-; AVX1-NEXT: vmovq %xmm1, %r9
-; AVX1-NEXT: movq %r9, %rdx
-; AVX1-NEXT: shrq $30, %rdx
+; AVX1-NEXT: vmovd %xmm1, %esi
+; AVX1-NEXT: vpextrd $1, %xmm1, %r8d
+; AVX1-NEXT: vpextrd $2, %xmm1, %r9d
+; AVX1-NEXT: vpextrd $3, %xmm1, %r10d
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpextrq $1, %xmm1, %r10
-; AVX1-NEXT: movq %r10, %rdi
-; AVX1-NEXT: shrq $30, %rdi
-; AVX1-NEXT: vmovq %xmm1, %rax
-; AVX1-NEXT: movq %rax, %rsi
-; AVX1-NEXT: shrq $30, %rsi
+; AVX1-NEXT: vmovd %xmm1, %edx
+; AVX1-NEXT: vpextrd $1, %xmm1, %edi
+; AVX1-NEXT: vpextrd $2, %xmm1, %eax
+; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
-; AVX1-NEXT: andl $7, %r9d
-; AVX1-NEXT: andl $28, %edx
+; AVX1-NEXT: andl $7, %esi
; AVX1-NEXT: andl $7, %r8d
-; AVX1-NEXT: andl $28, %ecx
-; AVX1-NEXT: andl $7, %eax
-; AVX1-NEXT: andl $28, %esi
+; AVX1-NEXT: andl $7, %r9d
; AVX1-NEXT: andl $7, %r10d
-; AVX1-NEXT: andl $28, %edi
+; AVX1-NEXT: andl $7, %edx
+; AVX1-NEXT: andl $7, %edi
+; AVX1-NEXT: andl $7, %eax
+; AVX1-NEXT: andl $7, %ecx
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -1375,36 +1367,32 @@ define <4 x i64> @var_shuffle_v4i64_from
define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpextrq $1, %xmm1, %r8
-; AVX1-NEXT: movq %r8, %r10
-; AVX1-NEXT: shrq $30, %r10
-; AVX1-NEXT: vmovq %xmm1, %r9
-; AVX1-NEXT: movq %r9, %rsi
-; AVX1-NEXT: shrq $30, %rsi
+; AVX1-NEXT: vmovd %xmm1, %r8d
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: andl $3, %r9d
-; AVX1-NEXT: andl $12, %esi
; AVX1-NEXT: andl $3, %r8d
-; AVX1-NEXT: andl $12, %r10d
+; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
+; AVX1-NEXT: andl $3, %r9d
+; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
+; AVX1-NEXT: andl $3, %r10d
+; AVX1-NEXT: vpextrd $3, %xmm1, %esi
+; AVX1-NEXT: andl $3, %esi
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: movq %rax, %rdi
-; AVX1-NEXT: shrq $30, %rdi
-; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: movq %rcx, %rdx
-; AVX1-NEXT: shrq $30, %rdx
-; AVX1-NEXT: andl $3, %ecx
-; AVX1-NEXT: andl $12, %edx
+; AVX1-NEXT: vmovd %xmm0, %edi
+; AVX1-NEXT: andl $3, %edi
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: andl $3, %eax
-; AVX1-NEXT: andl $12, %edi
+; AVX1-NEXT: vpextrd $2, %xmm0, %ecx
+; AVX1-NEXT: andl $3, %ecx
+; AVX1-NEXT: vpextrd $3, %xmm0, %edx
+; AVX1-NEXT: andl $3, %edx
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rdx), %xmm0, %xmm0
-; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0
-; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rax,4), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rcx,4), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdx,4), %xmm0, %xmm0
; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rsi), %xmm1, %xmm1
-; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r8,4), %xmm1, %xmm1
-; AVX1-NEXT: vpinsrd $3, -24(%rsp,%r10), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $1, -24(%rsp,%r9,4), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r10,4), %xmm1, %xmm1
+; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -2402,28 +2390,24 @@ define <4 x double> @var_shuffle_v4f64_f
define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpextrq $1, %xmm1, %r8
-; AVX1-NEXT: movq %r8, %r10
-; AVX1-NEXT: shrq $30, %r10
-; AVX1-NEXT: vmovq %xmm1, %r9
-; AVX1-NEXT: movq %r9, %rdx
-; AVX1-NEXT: shrq $30, %rdx
+; AVX1-NEXT: vmovd %xmm1, %r8d
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: andl $3, %r9d
-; AVX1-NEXT: andl $12, %edx
; AVX1-NEXT: andl $3, %r8d
-; AVX1-NEXT: andl $12, %r10d
+; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
+; AVX1-NEXT: andl $3, %r9d
+; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
+; AVX1-NEXT: andl $3, %r10d
+; AVX1-NEXT: vpextrd $3, %xmm1, %esi
+; AVX1-NEXT: andl $3, %esi
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: movq %rax, %rdi
-; AVX1-NEXT: shrq $30, %rdi
-; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: movq %rcx, %rsi
-; AVX1-NEXT: shrq $30, %rsi
-; AVX1-NEXT: andl $3, %ecx
-; AVX1-NEXT: andl $12, %esi
+; AVX1-NEXT: vmovd %xmm0, %edi
+; AVX1-NEXT: andl $3, %edi
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: andl $3, %eax
-; AVX1-NEXT: andl $12, %edi
+; AVX1-NEXT: vpextrd $2, %xmm0, %ecx
+; AVX1-NEXT: andl $3, %ecx
+; AVX1-NEXT: vpextrd $3, %xmm0, %edx
+; AVX1-NEXT: andl $3, %edx
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
@@ -2475,19 +2459,17 @@ define <4 x i32> @var_shuffle_v4i32_from
; AVX-NEXT: movq %rsp, %rbp
; AVX-NEXT: andq $-32, %rsp
; AVX-NEXT: subq $64, %rsp
-; AVX-NEXT: vmovq %xmm1, %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shrq $30, %rcx
-; AVX-NEXT: andl $28, %ecx
-; AVX-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX-NEXT: movq %rdx, %rsi
-; AVX-NEXT: sarq $32, %rsi
+; AVX-NEXT: vmovd %xmm1, %eax
+; AVX-NEXT: vmovaps %ymm0, (%rsp)
; AVX-NEXT: andl $7, %eax
+; AVX-NEXT: vpextrd $1, %xmm1, %ecx
+; AVX-NEXT: andl $7, %ecx
+; AVX-NEXT: vpextrd $2, %xmm1, %edx
; AVX-NEXT: andl $7, %edx
-; AVX-NEXT: vmovaps %ymm0, (%rsp)
+; AVX-NEXT: vpextrd $3, %xmm1, %esi
; AVX-NEXT: andl $7, %esi
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpinsrd $1, (%rsp,%rcx), %xmm0, %xmm0
+; AVX-NEXT: vpinsrd $1, (%rsp,%rcx,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, (%rsp,%rdx,4), %xmm0, %xmm0
; AVX-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm0, %xmm0
; AVX-NEXT: movq %rbp, %rsp
More information about the llvm-commits
mailing list