[llvm] r352333 - [x86] add restriction for lowering to vpermps
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 27 13:53:33 PST 2019
Author: spatel
Date: Sun Jan 27 13:53:33 2019
New Revision: 352333
URL: http://llvm.org/viewvc/llvm-project?rev=352333&view=rev
Log:
[x86] add restriction for lowering to vpermps
This transform was added with rL351346, and we had
an escape for shufps, but we also want one for
unpckhps/unpcklps vs. vpermps because vpermps doesn't take
an immediate shuffle index operand.
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/vector-shuffle-128-unpck.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=352333&r1=352332&r2=352333&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Jan 27 13:53:33 2019
@@ -9719,6 +9719,21 @@ static bool isUnpackWdShuffleMask(ArrayR
return IsUnpackwdMask;
}
+static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask) {
+ // Create 128-bit vector type based on mask size.
+ MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
+ MVT VT = MVT::getVectorVT(EltVT, Mask.size());
+
+ // Match any of unary/binary or low/high.
+ for (unsigned i = 0; i != 4; ++i) {
+ SmallVector<int, 16> UnpackMask;
+ createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
+ if (isTargetShuffleEquivalent(Mask, UnpackMask))
+ return true;
+ }
+ return false;
+}
+
/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -11709,8 +11724,10 @@ static SDValue lowerShuffleOfExtractsAsV
return SDValue();
// Final bailout: if the mask is simple, we are better off using an extract
- // and a simple narrow shuffle.
- if (NumElts == 4 && isSingleSHUFPSMask(NewMask))
+ // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
+ // because that avoids a constant load from memory.
+ if (NumElts == 4 &&
+ (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask)))
return SDValue();
// Extend the shuffle mask with undef elements.
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-unpck.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-128-unpck.ll?rev=352333&r1=352332&r2=352333&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-unpck.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-unpck.ll Sun Jan 27 13:53:33 2019
@@ -45,23 +45,15 @@ define <2 x double> @unpckh_unary_extrac
ret <2 x double> %r
}
-; FIXME: vpermps requires a constant load for the index op. It's unlikely to be profitable.
+; vpermps requires a constant load for the index op. It's unlikely to be profitable.
define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
-; AVX1-LABEL: unpckh_unary_extracted_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8i32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckh_unary_extracted_v8i32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -69,20 +61,12 @@ define <4 x i32> @unpckh_unary_extracted
}
define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) {
-; AVX1-LABEL: unpckh_unary_extracted_v8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8f32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,3,7,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckh_unary_extracted_v8f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -169,23 +153,15 @@ define <2 x double> @unpckl_unary_extrac
ret <2 x double> %r
}
-; FIXME: vpermps requires a constant load for the index op. It's unlikely to be profitable.
+; vpermps requires a constant load for the index op. It's unlikely to be profitable.
define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
-; AVX1-LABEL: unpckl_unary_extracted_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8i32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,4,1,5,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckl_unary_extracted_v8i32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -193,20 +169,12 @@ define <4 x i32> @unpckl_unary_extracted
}
define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) {
-; AVX1-LABEL: unpckl_unary_extracted_v8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8f32:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,4,1,5,u,u,u,u>
-; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2OR512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2OR512VL-NEXT: vzeroupper
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: unpckl_unary_extracted_v8f32:
+; ALL: # %bb.0:
+; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
+; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
More information about the llvm-commits
mailing list