[llvm] r290064 - [X86][SSE] Add support for combining target shuffles to SHUFPS.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 18 06:26:03 PST 2016
Author: rksimon
Date: Sun Dec 18 08:26:02 2016
New Revision: 290064
URL: http://llvm.org/viewvc/llvm-project?rev=290064&view=rev
Log:
[X86][SSE] Add support for combining target shuffles to SHUFPS.
As discussed on D27692, the next step will be to allow cross-domain shuffles once the combined shuffle depth passes a certain point.
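(Illustrative aside, not part of the commit message: SHUFPS builds its two low result elements from the first source and its two high result elements from the second source, using two immediate bits per element. The standalone C++ sketch below shows how a per-lane mask such as <0,1,4,4> packs into the 0x04 immediate. The helper name packShufpsImm is invented for this example; the committed code performs the equivalent packing through getV4X86ShuffleImm.)

// Standalone illustration (not LLVM code): pack a 4-element shuffle mask,
// whose low half indexes the first source and whose high half indexes the
// second source, into a SHUFPS-style 8-bit immediate.  Element indices
// 0..3 refer to the first source, 4..7 to the second, -1 means undef.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint8_t packShufpsImm(const std::array<int, 4> &Mask) {
  uint8_t Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    // SHUFPS reads result elements 0-1 from the first operand and 2-3 from
    // the second, so the low half must use indices 0..3 and the high half
    // indices 4..7 (undef slots may pick anything; reuse the slot number).
    assert(M == -1 || (i < 2 ? (M >= 0 && M < 4) : (M >= 4 && M < 8)));
    unsigned Sel = (M == -1) ? (unsigned)i & 3 : (unsigned)M & 3;
    Imm |= Sel << (2 * i);
  }
  return Imm;
}

int main() {
  // <0,1,4,4>: keep the two low floats of the first vector, splat the first
  // float of the second vector into the two high slots.  This is the pattern
  // the new combine emits as "shufps $4, %xmm1, %xmm0" in the tests below.
  std::printf("imm = 0x%02x\n", (unsigned)packShufpsImm({0, 1, 4, 4})); // imm = 0x04
}

With that encoding in mind, the shufps assertions appearing in the updated tests, e.g. xmm0 = xmm0[0,1],xmm1[0,0], are simply the disassembled form of that immediate.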
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
llvm/trunk/test/CodeGen/X86/extractelement-load.ll
llvm/trunk/test/CodeGen/X86/lower-bitcast.ll
llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll
llvm/trunk/test/CodeGen/X86/oddshuffles.ll
llvm/trunk/test/CodeGen/X86/pr30511.ll
llvm/trunk/test/CodeGen/X86/vec_extract-mmx.ll
llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Dec 18 08:26:02 2016
@@ -7779,6 +7779,42 @@ is256BitLaneRepeatedShuffleMask(MVT VT,
return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
+/// Test whether a target shuffle mask is equivalent within each sub-lane.
+/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
+static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
+ ArrayRef<int> Mask,
+ SmallVectorImpl<int> &RepeatedMask) {
+ int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
+ RepeatedMask.assign(LaneSize, SM_SentinelUndef);
+ int Size = Mask.size();
+ for (int i = 0; i < Size; ++i) {
+ assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
+ if (Mask[i] == SM_SentinelUndef)
+ continue;
+ if (Mask[i] == SM_SentinelZero) {
+ if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
+ return false;
+ RepeatedMask[i % LaneSize] = SM_SentinelZero;
+ continue;
+ }
+ if ((Mask[i] % Size) / LaneSize != i / LaneSize)
+ // This entry crosses lanes, so there is no way to model this shuffle.
+ return false;
+
+ // Ok, handle the in-lane shuffles by detecting if and when they repeat.
+ // Adjust second vector indices to start at LaneSize instead of Size.
+ int LocalM =
+ Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
+ if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
+ // This is the first non-undef entry in this slot of a 128-bit lane.
+ RepeatedMask[i % LaneSize] = LocalM;
+ else if (RepeatedMask[i % LaneSize] != LocalM)
+ // Found a mismatch with the repeated mask.
+ return false;
+ }
+ return true;
+}
+
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
@@ -26274,6 +26310,50 @@ static bool matchBinaryPermuteVectorShuf
}
}
+ // Attempt to combine to SHUFPS.
+ if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
+ (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
+ SmallVector<int, 4> RepeatedMask;
+ if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
+ auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
+ int M0 = RepeatedMask[Offset];
+ int M1 = RepeatedMask[Offset + 1];
+
+ if (isUndefInRange(RepeatedMask, Offset, 2)) {
+ return DAG.getUNDEF(MaskVT);
+ } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : 0);
+ S1 = (SM_SentinelUndef == M1 ? -1 : 1);
+ return getZeroVector(MaskVT, Subtarget, DAG, DL);
+ } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+ S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+ return V1;
+ } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
+ S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
+ S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
+ return V2;
+ }
+
+ return SDValue();
+ };
+
+ int ShufMask[4] = {-1, -1, -1, -1};
+ SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
+ SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
+
+ if (Lo && Hi) {
+ V1 = Lo;
+ V2 = Hi;
+ Shuffle = X86ISD::SHUFP;
+ ShuffleVT = MaskVT;
+ PermuteImm = getV4X86ShuffleImm(ShufMask);
+ return true;
+ }
+ }
+ }
+
return false;
}
@@ -27294,7 +27374,8 @@ static SDValue combineTargetShuffle(SDVa
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
- switch (N.getOpcode()) {
+ unsigned Opcode = N.getOpcode();
+ switch (Opcode) {
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
@@ -27369,6 +27450,31 @@ static SDValue combineTargetShuffle(SDVa
return SDValue();
}
+ case X86ISD::MOVSD:
+ case X86ISD::MOVSS: {
+ bool isFloat = VT.isFloatingPoint();
+ SDValue V0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue V1 = peekThroughBitcasts(N->getOperand(1));
+ bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
+ bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
+ bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
+ bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
+ assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
+
+ // We often lower to MOVSD/MOVSS from integer as well as native float
+ // types; remove unnecessary domain-crossing bitcasts if we can to make it
+ // easier to combine shuffles later on. We've already accounted for the
+ // domain switching cost when we decided to lower with it.
+ if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
+ MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
+ : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
+ V0 = DAG.getBitcast(NewVT, V0);
+ V1 = DAG.getBitcast(NewVT, V1);
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
+ }
+
+ return SDValue();
+ }
case X86ISD::INSERTPS: {
assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
SDValue Op0 = N.getOperand(0);
@@ -28275,7 +28381,7 @@ static SDValue combineVSelectWithAllOnes
return SDValue();
bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
- // Check if the first operand is all zeros.This situation only
+ // Check if the first operand is all zeros.This situation only
// applies to avx512.
if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) {
//Invert the cond to not(cond) : xor(op,allones)=not(op)
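(Another illustrative aside before the test updates: the per-128-bit-lane reduction performed by the new isRepeatedTargetShuffleMask may help when reading the AVX/AVX512 test diffs. The sketch below is a simplified, standalone illustration only: the function name isRepeatedLaneMask is made up for the example, and the real helper additionally tracks SM_SentinelZero, which is omitted here. For 128-bit vectors the lane check is trivially satisfied.)

// Standalone sketch (not the LLVM helper itself) of the per-128-bit-lane
// repetition check.  Indices >= Size refer to the second source and are
// rebased to start at LaneSize; -1 means undef.
#include <cstdio>
#include <vector>

static bool isRepeatedLaneMask(const std::vector<int> &Mask, int LaneSize,
                               std::vector<int> &Repeated) {
  int Size = (int)Mask.size();
  Repeated.assign(LaneSize, -1);
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef matches anything
    if ((M % Size) / LaneSize != i / LaneSize)
      return false; // crosses a 128-bit lane, cannot be modelled per lane
    // Rebase second-source indices so they start at LaneSize, keeping the
    // repeated mask able to distinguish the two sources.
    int LocalM = (M < Size) ? M % LaneSize : M % LaneSize + LaneSize;
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = LocalM;
    else if (Repeated[i % LaneSize] != LocalM)
      return false; // lanes disagree
  }
  return true;
}

int main() {
  // v8f32 mask <0,1,8,8,4,5,12,12>: each 128-bit lane takes its two low
  // floats from V1's lane and splats V2's lane element 0 into the high two,
  // so the mask reduces to the repeated mask <0,1,4,4>.
  std::vector<int> Mask = {0, 1, 8, 8, 4, 5, 12, 12}, Repeated;
  if (isRepeatedLaneMask(Mask, 4, Repeated)) {
    std::printf("repeated mask:");
    for (int M : Repeated)
      std::printf(" %d", M);
    std::printf("\n"); // repeated mask: 0 1 4 4
  }
}

That repeated mask <0,1,4,4> is exactly the per-lane pattern matchBinaryPermuteVectorShuffle now lowers to a single (v)shufps with immediate 0x04, matching lines such as shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] in the test diffs below.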
Modified: llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-cvt.ll?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cvt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-cvt.ll Sun Dec 18 08:26:02 2016
@@ -84,8 +84,7 @@ define <2 x float> @sltof2f32(<2 x i64>
; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; KNL-NEXT: retq
;
; SKX-LABEL: sltof2f32:
Modified: llvm/trunk/test/CodeGen/X86/extractelement-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/extractelement-load.ll?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/extractelement-load.ll (original)
+++ llvm/trunk/test/CodeGen/X86/extractelement-load.ll Sun Dec 18 08:26:02 2016
@@ -89,7 +89,7 @@ define i64 @t4(<2 x double>* %a) {
; X32-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X32-SSE2-NEXT: movd %xmm1, %eax
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; X32-SSE2-NEXT: movd %xmm0, %edx
; X32-SSE2-NEXT: retl
;
Modified: llvm/trunk/test/CodeGen/X86/lower-bitcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/lower-bitcast.ll?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/lower-bitcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/lower-bitcast.ll Sun Dec 18 08:26:02 2016
@@ -9,7 +9,7 @@
define double @test1(double %A) {
; CHECK-LABEL: test1:
; CHECK: # BB#0:
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-128.ll Sun Dec 18 08:26:02 2016
@@ -1104,39 +1104,26 @@ define <4 x float> @merge_4f32_f32_2345_
define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4f32_f32_X0YY:
; SSE: # BB#0:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT: retq
;
; AVX-LABEL: merge_4f32_f32_X0YY:
; AVX: # BB#0:
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX-NEXT: retq
;
-; X32-SSE1-LABEL: merge_4f32_f32_X0YY:
-; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE1-NEXT: retl
-;
-; X32-SSE41-LABEL: merge_4f32_f32_X0YY:
-; X32-SSE41: # BB#0:
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE41-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE41-NEXT: retl
+; X32-SSE-LABEL: merge_4f32_f32_X0YY:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; X32-SSE-NEXT: retl
%val0 = load float, float* %ptr0, align 4
%val1 = load float, float* %ptr1, align 4
%res0 = insertelement <4 x float> undef, float %val0, i32 0
Modified: llvm/trunk/test/CodeGen/X86/oddshuffles.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/oddshuffles.ll?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/oddshuffles.ll (original)
+++ llvm/trunk/test/CodeGen/X86/oddshuffles.ll Sun Dec 18 08:26:02 2016
@@ -1100,48 +1100,46 @@ define void @interleave_24i16_in(<24 x i
define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind {
; SSE2-LABEL: interleave_24i32_out:
; SSE2: # BB#0:
-; SSE2-NEXT: movdqu 80(%rdi), %xmm8
-; SSE2-NEXT: movups 64(%rdi), %xmm10
+; SSE2-NEXT: movups 80(%rdi), %xmm5
+; SSE2-NEXT: movups 64(%rdi), %xmm8
; SSE2-NEXT: movups (%rdi), %xmm0
-; SSE2-NEXT: movups 16(%rdi), %xmm7
-; SSE2-NEXT: movdqu 32(%rdi), %xmm9
-; SSE2-NEXT: movups 48(%rdi), %xmm2
-; SSE2-NEXT: movaps %xmm2, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm3[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,0]
-; SSE2-NEXT: movaps %xmm0, %xmm5
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm7[2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm5[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,2,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,0],xmm0[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,0,3]
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3]
-; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm6[0],xmm1[1]
+; SSE2-NEXT: movups 16(%rdi), %xmm6
+; SSE2-NEXT: movups 32(%rdi), %xmm2
+; SSE2-NEXT: movups 48(%rdi), %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm8[2,3]
+; SSE2-NEXT: movaps %xmm5, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0]
+; SSE2-NEXT: movaps %xmm0, %xmm4
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[2,3]
+; SSE2-NEXT: movaps %xmm2, %xmm7
+; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm4[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,3,0,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm8[3,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,1,0,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
+; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm10[0],xmm9[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; SSE2-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
; SSE2-NEXT: movups %xmm3, 16(%rsi)
-; SSE2-NEXT: movups %xmm5, (%rsi)
-; SSE2-NEXT: movups %xmm2, 16(%rdx)
+; SSE2-NEXT: movups %xmm4, (%rsi)
+; SSE2-NEXT: movups %xmm1, 16(%rdx)
; SSE2-NEXT: movups %xmm0, (%rdx)
-; SSE2-NEXT: movupd %xmm1, 16(%rcx)
-; SSE2-NEXT: movupd %xmm4, (%rcx)
+; SSE2-NEXT: movupd %xmm7, 16(%rcx)
+; SSE2-NEXT: movupd %xmm9, (%rcx)
; SSE2-NEXT: retq
;
; SSE42-LABEL: interleave_24i32_out:
Modified: llvm/trunk/test/CodeGen/X86/pr30511.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr30511.ll?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr30511.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr30511.ll Sun Dec 18 08:26:02 2016
@@ -8,7 +8,7 @@ define i64 @PR30511(<2 x double> %a) {
; CHECK-LABEL: PR30511:
; CHECK: # BB#0:
; CHECK-NEXT: addpd {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: cvtdq2pd %xmm0, %xmm0
; CHECK-NEXT: mulpd {{.*}}(%rip), %xmm0
; CHECK-NEXT: movd %xmm0, %rax
Modified: llvm/trunk/test/CodeGen/X86/vec_extract-mmx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_extract-mmx.ll?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_extract-mmx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_extract-mmx.ll Sun Dec 18 08:26:02 2016
@@ -16,8 +16,8 @@ define i32 @test0(<1 x i64>* %v4) nounwi
; X32-NEXT: movl %ecx, (%esp)
; X32-NEXT: pshufw $238, (%esp), %mm0 # mm0 = mem[2,3,2,3]
; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: addl $32, %eax
; X32-NEXT: movl %ebp, %esp
@@ -55,8 +55,8 @@ define i32 @test1(i32* nocapture readonl
; X32-NEXT: movd (%eax), %mm0
; X32-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3]
; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: emms
; X32-NEXT: movl %ebp, %esp
@@ -98,8 +98,8 @@ define i32 @test2(i32* nocapture readonl
; X32-NEXT: movl 8(%ebp), %eax
; X32-NEXT: pshufw $232, (%eax), %mm0 # mm0 = mem[0,2,2,3]
; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: emms
; X32-NEXT: movl %ebp, %esp
@@ -149,8 +149,8 @@ define i32 @test4(x86_mmx %a) nounwind {
; X32-NEXT: andl $-8, %esp
; X32-NEXT: subl $8, %esp
; X32-NEXT: movq %mm0, (%esp)
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,0,1]
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,0,1]
; X32-NEXT: movd %xmm0, %eax
; X32-NEXT: movl %ebp, %esp
; X32-NEXT: popl %ebp
Modified: llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_int_to_fp.ll Sun Dec 18 08:26:02 2016
@@ -1059,8 +1059,7 @@ define <4 x float> @sitofp_2i64_to_4f32(
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_2i64_to_4f32:
@@ -1071,8 +1070,7 @@ define <4 x float> @sitofp_2i64_to_4f32(
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_2i64_to_4f32:
@@ -1083,8 +1081,7 @@ define <4 x float> @sitofp_2i64_to_4f32(
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_2i64_to_4f32:
@@ -1185,8 +1182,7 @@ define <4 x float> @sitofp_4i64_to_4f32_
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: sitofp_4i64_to_4f32_undef:
@@ -1197,8 +1193,7 @@ define <4 x float> @sitofp_4i64_to_4f32_
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef:
@@ -1209,8 +1204,7 @@ define <4 x float> @sitofp_4i64_to_4f32_
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef:
@@ -1673,8 +1667,7 @@ define <4 x float> @uitofp_2i64_to_4f32(
; VEX-NEXT: # BB#7:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: .LBB39_8:
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_2i64_to_4f32:
@@ -1685,8 +1678,7 @@ define <4 x float> @uitofp_2i64_to_4f32(
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_2i64_to_4f32:
@@ -1697,8 +1689,7 @@ define <4 x float> @uitofp_2i64_to_4f32(
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_2i64_to_4f32:
@@ -1911,8 +1902,7 @@ define <4 x float> @uitofp_4i64_to_4f32_
; VEX-NEXT: # BB#7:
; VEX-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm1
; VEX-NEXT: .LBB41_8:
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; VEX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; VEX-NEXT: retq
;
; AVX512F-LABEL: uitofp_4i64_to_4f32_undef:
@@ -1923,8 +1913,7 @@ define <4 x float> @uitofp_4i64_to_4f32_
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512F-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef:
@@ -1935,8 +1924,7 @@ define <4 x float> @uitofp_4i64_to_4f32_
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm0
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; AVX512VL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX512VL-NEXT: retq
;
; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef:
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v4.ll Sun Dec 18 08:26:02 2016
@@ -1847,16 +1847,20 @@ define <4 x i32> @shuffle_v4i32_bitcast_
define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4f32_bitcast_4401:
; SSE: # BB#0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movapd %xmm1, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
+; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: shuffle_v4f32_bitcast_4401:
-; AVX: # BB#0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4f32_bitcast_4401:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v4f32_bitcast_4401:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: retq
%1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
%2 = bitcast <4 x i32> %1 to <2 x double>
%3 = bitcast <4 x float> %a to <2 x double>
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll Sun Dec 18 08:26:02 2016
@@ -75,12 +75,16 @@ define <4 x float> @combine_vpermil2ps_i
define <4 x float> @combine_vpermil2ps_1z74(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: combine_vpermil2ps_1z74:
; X32: # BB#0:
-; X32-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[1],zero,xmm1[3,0]
+; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,0]
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X32-NEXT: retl
;
; X64-LABEL: combine_vpermil2ps_1z74:
; X64: # BB#0:
-; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[1],zero,xmm1[3,0]
+; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[3,0]
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; X64-NEXT: retq
%res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 1, i32 1, i32 7, i32 4>, i8 0)
%res1 = shufflevector <4 x float> %res0, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll?rev=290064&r1=290063&r2=290064&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-mmx.ll Sun Dec 18 08:26:02 2016
@@ -8,9 +8,9 @@ define void @test0(<1 x i64>* %x) {
; X32-LABEL: test0:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X32-NEXT: movq %xmm0, (%eax)
+; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X32-NEXT: movlps %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test0: